Files
clang-p2996/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
Carl Ritson 86627149f6 [AMDGPU] Mitigate GFX12 VALU read SGPR hazard (#100067)
Any SGPR read by a VALU can potentially obscure SALU writes to the same
register.
Insert s_wait_alu instructions to mitigate the hazard on affected paths.

Compute a global cache of SGPRs with any VALU reads and use this to
avoid inserting mitigation for SGPRs never accessed by VALUs.

To avoid an excessive search when compile time is a priority, implement a
secondary mode in which all SALU writes are mitigated.

Co-authored-by: Shilei Tian <shilei.tian@amd.com>
2024-09-04 12:15:20 +09:00

20520 lines
931 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;
; Every llc invocation below compiles this file at -O0 for one target
; configuration (triple, -mcpu and optional -mattr or backend flag) and pipes
; the output to FileCheck under a prefix dedicated to that configuration.
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck --check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX940-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
; Kernel under test. It atomically loads an i32 from %in using the
; "singlethread" sync scope with unordered ordering, then writes the value to
; %out with an ordinary store. For every checked prefix the expected code is
; kernel argument setup, one flat load, a wait, and one flat store.
; The assertion lines embedded in the signature are produced by
; utils/update_llc_test_checks.py and should be regenerated with that script
; instead of being edited by hand.
define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX7-LABEL: flat_singlethread_unordered_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_unordered_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_unordered_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
; Atomic i32 load from %in, singlethread scope, unordered ordering.
%val = load atomic i32, ptr %in syncscope("singlethread") unordered, align 4
; Ordinary (non-atomic) store of the loaded value to %out.
store i32 %val, ptr %out
ret void
}
; Kernel under test. It atomically loads an i32 from %in using the
; "singlethread" sync scope with monotonic ordering, then writes the value to
; %out with an ordinary store. For every checked prefix the expected code is
; kernel argument setup, one flat load, a wait, and one flat store.
; The assertion lines embedded in the signature are produced by
; utils/update_llc_test_checks.py and should be regenerated with that script
; instead of being edited by hand.
define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX7-LABEL: flat_singlethread_monotonic_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
; Atomic i32 load from %in, singlethread scope, monotonic ordering.
%val = load atomic i32, ptr %in syncscope("singlethread") monotonic, align 4
; Ordinary (non-atomic) store of the loaded value to %out.
store i32 %val, ptr %out
ret void
}
; Kernel under test. It atomically loads an i32 from %in using the
; "singlethread" sync scope with acquire ordering, then writes the value to
; %out with an ordinary store. For every checked prefix the expected code is
; kernel argument setup, one flat load, a wait, and one flat store.
; The assertion lines embedded in the signature are produced by
; utils/update_llc_test_checks.py and should be regenerated with that script
; instead of being edited by hand.
define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX7-LABEL: flat_singlethread_acquire_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
; Atomic i32 load from %in, singlethread scope, acquire ordering.
%val = load atomic i32, ptr %in syncscope("singlethread") acquire, align 4
; Ordinary (non-atomic) store of the loaded value to %out.
store i32 %val, ptr %out
ret void
}
; Kernel under test. It atomically loads an i32 from %in using the
; "singlethread" sync scope with seq_cst ordering, then writes the value to
; %out with an ordinary store. For every checked prefix the expected code is
; kernel argument setup, one flat load, a wait, and one flat store.
; The assertion lines embedded in the signature are produced by
; utils/update_llc_test_checks.py and should be regenerated with that script
; instead of being edited by hand.
define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX7-LABEL: flat_singlethread_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
; Atomic i32 load from %in, singlethread scope, seq_cst ordering.
%val = load atomic i32, ptr %in syncscope("singlethread") seq_cst, align 4
; Ordinary (non-atomic) store of the loaded value to %out.
store i32 %val, ptr %out
ret void
}
; Atomic i32 store of %in through the generic pointer %out, using
; syncscope("singlethread") with `unordered` ordering.
;
; What the expectations below pin down for every RUN configuration:
;   * the only wait instruction is the one that immediately follows the two
;     scalar loads at the top of the kernel;
;   * exactly one flat store is emitted, and it is followed directly by
;     s_endpgm (no extra wait or cache-maintenance instruction around it);
;   * the two gfx940 configurations expect the store to carry the sc0 sc1
;     modifiers, the other configurations expect a store with no modifiers.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX7-LABEL: flat_singlethread_unordered_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_unordered_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_unordered_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4
ret void
}
; Atomic i32 store of %in through the generic pointer %out, using
; syncscope("singlethread") with `monotonic` ordering.
;
; What the expectations below pin down for every RUN configuration:
;   * the only wait instruction is the one that immediately follows the two
;     scalar loads at the top of the kernel;
;   * exactly one flat store is emitted, and it is followed directly by
;     s_endpgm (no extra wait or cache-maintenance instruction around it);
;   * the two gfx940 configurations expect the store to carry the sc0 sc1
;     modifiers, the other configurations expect a store with no modifiers.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX7-LABEL: flat_singlethread_monotonic_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4
ret void
}
; Atomic i32 store of %in through the generic pointer %out, using
; syncscope("singlethread") with `release` ordering.
;
; What the expectations below pin down for every RUN configuration:
;   * the only wait instruction is the one that immediately follows the two
;     scalar loads at the top of the kernel, i.e. the release ordering adds no
;     wait of its own ahead of the store at this scope;
;   * exactly one flat store is emitted, and it is followed directly by
;     s_endpgm (no cache-maintenance instruction around it);
;   * the two gfx940 configurations expect the store to carry the sc0 sc1
;     modifiers, the other configurations expect a store with no modifiers.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @flat_singlethread_release_store(
; GFX7-LABEL: flat_singlethread_release_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_release_store:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_release_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_release_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4
ret void
}
; Atomic i32 store of %in through the generic pointer %out, using
; syncscope("singlethread") with `seq_cst` ordering.
;
; What the expectations below pin down for every RUN configuration:
;   * the only wait instruction is the one that immediately follows the two
;     scalar loads at the top of the kernel, i.e. the seq_cst ordering adds no
;     wait of its own ahead of the store at this scope;
;   * exactly one flat store is emitted, and it is followed directly by
;     s_endpgm (no cache-maintenance instruction around it);
;   * the two gfx940 configurations expect the store to carry the sc0 sc1
;     modifiers, the other configurations expect a store with no modifiers.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX7-LABEL: flat_singlethread_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4
ret void
}
; Volatile atomic exchange (atomicrmw xchg) of %in into the location addressed
; by the generic pointer %out, using syncscope("singlethread") with
; `monotonic` ordering. The exchanged-out value %val is never used.
;
; What the expectations below pin down for every RUN configuration:
;   * the only memory-counter wait is the one that immediately follows the two
;     scalar loads at the top of the kernel;
;   * exactly one flat_atomic_swap (flat_atomic_swap_b32 on gfx11/gfx12) is
;     emitted in the form without a destination register or modifiers, and it
;     is followed directly by s_endpgm;
;   * the two gfx1200 configurations additionally expect an s_wait_alu 0xfffe
;     right after the leading s_mov_b64.
;     NOTE(review): presumably the GFX12 VALU-read-SGPR hazard mitigation
;     rather than anything memory-model related - confirm.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic
ret void
}
define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
ret void
}
; Kernel that performs a volatile `atomicrmw xchg` through the `ptr %out`
; argument with syncscope("singlethread") and release ordering.
; For every llc configuration listed at the top of the file, the assertions
; below expect: scalar loads that fetch the pointer and the i32 operand, a wait
; for those scalar loads, copies into VGPRs, one flat atomic swap without a
; destination register, and then s_endpgm directly after the swap.
; The assertion lines are autogenerated (see the first line of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX7-LABEL: flat_singlethread_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_release_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_release_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
; The exchanged-out value %val is never read in this function.
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release
ret void
}
; Kernel that performs a volatile `atomicrmw xchg` through the `ptr %out`
; argument with syncscope("singlethread") and acq_rel ordering.
; For every llc configuration listed at the top of the file, the assertions
; below expect: scalar loads that fetch the pointer and the i32 operand, a wait
; for those scalar loads, copies into VGPRs, one flat atomic swap without a
; destination register, and then s_endpgm directly after the swap.
; The assertion lines are autogenerated (see the first line of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
; The exchanged-out value %val is never read in this function.
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
ret void
}
; Kernel that performs a volatile `atomicrmw xchg` through the `ptr %out`
; argument with syncscope("singlethread") and seq_cst ordering.
; For every llc configuration listed at the top of the file, the assertions
; below expect: scalar loads that fetch the pointer and the i32 operand, a wait
; for those scalar loads, copies into VGPRs, one flat atomic swap without a
; destination register, and then s_endpgm directly after the swap.
; The assertion lines are autogenerated (see the first line of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
; The exchanged-out value %val is never read in this function.
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
ret void
}
; Kernel that performs a volatile `atomicrmw xchg` through the `ptr %out`
; argument with syncscope("singlethread") and acquire ordering, then stores
; the value returned by the exchange back through %out.
; For every llc configuration listed at the top of the file, the assertions
; below expect: scalar loads that fetch the pointer and the i32 operand, a wait
; for those scalar loads, copies into VGPRs, one flat atomic swap in its
; returning form (destination v2), a second copy of the address into v[0:1],
; a counter wait, the flat store of v2, and finally s_endpgm.
; The assertion lines are autogenerated (see the first line of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire
; Storing %val gives the exchange result a use; the expected output above
; shows the swap writing a destination register in this variant.
store i32 %val, ptr %out, align 4
ret void
}
define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel
store i32 %val, ptr %out, align 4
ret void
}
; Kernel under test: a volatile atomicrmw xchg on %out with seq_cst ordering at
; syncscope("singlethread"). The value returned by the exchange is stored back
; to %out, so the result of the atomic is actually consumed.
; The assertions interleaved with the signature are produced by
; utils/update_llc_test_checks.py, one group per FileCheck prefix used by the
; llc invocations at the top of the file; regenerate them instead of editing
; them by hand.
; In every group the swap writes its result to v2, and exactly one wait is
; expected between the swap and the store that reads v2.
define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
; The exchange result feeds the store below, which keeps the returned value
; live and is why the returning form of the swap is expected above.
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst
store i32 %val, ptr %out, align 4
ret void
}
; Kernel under test: a volatile cmpxchg with monotonic success and monotonic
; failure ordering at syncscope("singlethread"). It compares against %old and
; writes %in at the address %out advanced by four i32 elements; the result
; %val is never used.
; The assertions interleaved with the signature are produced by
; utils/update_llc_test_checks.py, one group per FileCheck prefix used by the
; llc invocations at the top of the file; regenerate them instead of editing
; them by hand.
; Depending on the group, the 16-byte displacement shows up either as
; s_add_u32/s_addc_u32 arithmetic with the constant 16 or as offset:16 on the
; flat instruction. In every group the expected cmpswap has no destination
; register and is followed directly by s_endpgm.
; The two gfx12 groups also expect s_wait_alu 0xfffe right after the
; s_mov_b64 (presumably the SGPR hazard mitigation; confirm against the
; hazard recognizer).
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Four i32 elements past %out, i.e. the 16 seen in the expected output above.
%gep = getelementptr i32, ptr %out, i32 4
; Operand order is (pointer, expected value, new value); %val is left unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
ret void
}
; Kernel under test: a volatile cmpxchg with acquire success and monotonic
; failure ordering at syncscope("singlethread"). It compares against %old and
; writes %in at the address %out advanced by four i32 elements; the result
; %val is never used.
; The assertions interleaved with the signature are produced by
; utils/update_llc_test_checks.py, one group per FileCheck prefix used by the
; llc invocations at the top of the file; regenerate them instead of editing
; them by hand.
; Depending on the group, the 16-byte displacement shows up either as
; s_add_u32/s_addc_u32 arithmetic with the constant 16 or as offset:16 on the
; flat instruction. In every group the expected cmpswap has no destination
; register and is followed directly by s_endpgm, so no instruction is expected
; after the atomic despite the acquire ordering.
; The two gfx12 groups also expect s_wait_alu 0xfffe right after the
; s_mov_b64 (presumably the SGPR hazard mitigation; confirm against the
; hazard recognizer).
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Four i32 elements past %out, i.e. the 16 seen in the expected output above.
%gep = getelementptr i32, ptr %out, i32 4
; Operand order is (pointer, expected value, new value); %val is left unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
ret void
}
; Test kernel for a volatile i32 cmpxchg through a flat pointer using
; syncscope("singlethread"), success ordering release and failure ordering
; monotonic. The access goes to element 4 of %out and the cmpxchg result is
; left unused.
; In every checked configuration the flat_atomic_cmpswap is immediately
; followed by s_endpgm and immediately preceded by the move that sets up its
; address operand.
; NOTE(review) presumably single-thread scope needs no extra synchronization
; around the atomic - confirm against the AMDGPU memory model documentation.
define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and store %in on a match; %val is intentionally unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
ret void
}
; Test kernel for a volatile i32 cmpxchg through a flat pointer using
; syncscope("singlethread"), success ordering acq_rel and failure ordering
; monotonic. The access goes to element 4 of %out and the cmpxchg result is
; left unused.
; In every checked configuration the flat_atomic_cmpswap is immediately
; followed by s_endpgm and immediately preceded by the move that sets up its
; address operand.
; NOTE(review) presumably single-thread scope needs no extra synchronization
; around the atomic - confirm against the AMDGPU memory model documentation.
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and store %in on a match; %val is intentionally unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
ret void
}
; Test kernel for a volatile i32 cmpxchg through a flat pointer using
; syncscope("singlethread"), success ordering seq_cst and failure ordering
; monotonic. The access goes to element 4 of %out and the cmpxchg result is
; left unused.
; In every checked configuration the flat_atomic_cmpswap is immediately
; followed by s_endpgm and immediately preceded by the move that sets up its
; address operand.
; NOTE(review) presumably single-thread scope needs no extra synchronization
; around the atomic - confirm against the AMDGPU memory model documentation.
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 of %out, i.e. 16 bytes past the base pointer.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and store %in on a match; %val is intentionally unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
ret void
}
; Kernel under test: a volatile flat cmpxchg at syncscope("singlethread") with
; monotonic success ordering and acquire failure ordering. In every RUN
; configuration the assertions below expect the flat_atomic_cmpswap to be
; followed directly by s_endpgm, with nothing emitted between the two. The
; gfx700 and gfx1010 runs form the +16 byte address with s_add_u32/s_addc_u32;
; the remaining runs fold it into the instruction's offset:16 field.
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Element 4 of an i32 array at %out, i.e. the 16-byte offset seen in the checks.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and store %in on a match; %val is never used afterwards.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
ret void
}
; Kernel under test: a volatile flat cmpxchg at syncscope("singlethread") with
; acquire ordering for both the success and the failure case. In every RUN
; configuration the assertions below expect the flat_atomic_cmpswap to be
; followed directly by s_endpgm, with nothing emitted between the two. The
; gfx700 and gfx1010 runs form the +16 byte address with s_add_u32/s_addc_u32;
; the remaining runs fold it into the instruction's offset:16 field.
define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Element 4 of an i32 array at %out, i.e. the 16-byte offset seen in the checks.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and store %in on a match; %val is never used afterwards.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
ret void
}
; Kernel under test: a volatile flat cmpxchg at syncscope("singlethread") with
; release success ordering and acquire failure ordering. In every RUN
; configuration the assertions below expect the flat_atomic_cmpswap to be
; followed directly by s_endpgm, with nothing emitted between the two. The
; gfx700 and gfx1010 runs form the +16 byte address with s_add_u32/s_addc_u32;
; the remaining runs fold it into the instruction's offset:16 field.
define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Element 4 of an i32 array at %out, i.e. the 16-byte offset seen in the checks.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and store %in on a match; %val is never used afterwards.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
ret void
}
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
ret void
}
; Test: volatile cmpxchg through a flat pointer at syncscope("singlethread")
; with success ordering seq_cst and failure ordering acquire.
; The kernel takes (ptr %out, i32 %in, i32 %old) and compare-and-swaps the i32
; located 16 bytes past %out; the cmpxchg result is unused.
; For every check prefix below, the expected code contains exactly one
; flat_atomic_cmpswap that is followed directly by s_endpgm, and the only
; memory-counter wait (s_waitcnt lgkmcnt(0) / s_wait_kmcnt 0x0) comes right
; after the scalar loads.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; i32 element index 4 is a 16-byte offset: it appears in the checks either as
; "offset:16" on the atomic or as an explicit 64-bit scalar add of 16.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the compare value and %in is the replacement value.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
ret void
}
; Test: volatile cmpxchg through a flat pointer at syncscope("singlethread")
; with success ordering monotonic and failure ordering seq_cst.
; The kernel takes (ptr %out, i32 %in, i32 %old) and compare-and-swaps the i32
; located 16 bytes past %out; the cmpxchg result is unused.
; For every check prefix below, the expected code contains exactly one
; flat_atomic_cmpswap that is followed directly by s_endpgm, and the only
; memory-counter wait (s_waitcnt lgkmcnt(0) / s_wait_kmcnt 0x0) comes right
; after the scalar loads.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; i32 element index 4 is a 16-byte offset: it appears in the checks either as
; "offset:16" on the atomic or as an explicit 64-bit scalar add of 16.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the compare value and %in is the replacement value.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
ret void
}
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
ret void
}
; Codegen test: volatile i32 cmpxchg through an unqualified `ptr` (no addrspace)
; at syncscope("singlethread") with release success ordering and seq_cst
; failure ordering.
; For every checked target the expected output is a single flat cmpswap
; instruction followed directly by s_endpgm. The gfx12 runs additionally expect
; an s_wait_alu right after the initial s_mov_b64.
; NOTE(review): the assertions below are autogenerated (see the note at the top
; of the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; %gep addresses i32 element 4 of %out, i.e. a 16-byte offset. In the expected
; output this appears either as offset:16 on the atomic or as an explicit
; 64-bit scalar add of 16 (gfx7 and gfx10 runs).
%gep = getelementptr i32, ptr %out, i32 4
; The cmpxchg result (%val) is never used; the kernel returns immediately.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
ret void
}
; Codegen test: volatile i32 cmpxchg through an unqualified `ptr` (no addrspace)
; at syncscope("singlethread") with acq_rel success ordering and seq_cst
; failure ordering.
; For every checked target the expected output is a single flat cmpswap
; instruction followed directly by s_endpgm. The gfx12 runs additionally expect
; an s_wait_alu right after the initial s_mov_b64.
; NOTE(review): the assertions below are autogenerated (see the note at the top
; of the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; %gep addresses i32 element 4 of %out, i.e. a 16-byte offset. In the expected
; output this appears either as offset:16 on the atomic or as an explicit
; 64-bit scalar add of 16 (gfx7 and gfx10 runs).
%gep = getelementptr i32, ptr %out, i32 4
; The cmpxchg result (%val) is never used; the kernel returns immediately.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
ret void
}
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
ret void
}
; Returning cmpxchg at singlethread scope with monotonic success and failure
; orderings. The kernel compares the i32 at %out[4] against %old, writes %in on
; a match, and stores the value that was loaded back to %out[0].
; Every target's expected output uses the returning form of the flat cmpswap
; (glc, sc0, or th:TH_ATOMIC_RETURN) followed by a single wait before the flat
; store of the result.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; %gep addresses the i32 four elements (16 bytes) past %out.
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
; Field 0 of the cmpxchg result is the value that was loaded from %gep.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Returning cmpxchg at singlethread scope with acquire success ordering and
; monotonic failure ordering. The kernel compares the i32 at %out[4] against
; %old, writes %in on a match, and stores the value that was loaded back to
; %out[0].
; In every target's expected output, the only instructions between the
; returning flat cmpswap and the flat store of its result are the address
; moves and a single wait.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; %gep addresses the i32 four elements (16 bytes) past %out.
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
; Field 0 of the cmpxchg result is the value that was loaded from %gep.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Returning flat cmpxchg at syncscope("singlethread") with release success
; ordering and monotonic failure ordering.
;
; The kernel runs the cmpxchg on the i32 located four elements (16 bytes) past
; %out, comparing against %old and swapping in %in, and then stores the value
; the cmpxchg read from memory to %out itself.
;
; The autogenerated assertions below pin the -O0 output of every llc
; configuration listed in the RUN lines at the top of the file. Each expected
; sequence consists only of the kernel-argument scalar loads, register moves
; (plus a scalar add of 16 on the targets that do not fold the offset), the
; flat_atomic_cmpswap, one wait, the flat store and s_endpgm; no cache
; invalidate or write-back instruction is expected anywhere in the sequence.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
;
; NOTE(review): the s_wait_alu 0xfffe expected right after the first s_mov_b64
; in the two gfx1200 runs is presumably the GFX12 SGPR hazard mitigation rather
; than anything required by the atomic itself - confirm.
define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of the i32 at index 4 from %out. This is the 16-byte displacement
; seen in the expected code, either folded into the atomic as offset:16 or
; added to the base pointer with s_add_u32/s_addc_u32.
%gep = getelementptr i32, ptr %out, i32 4
; The operation under test: volatile, single-thread scope, release on success
; and monotonic on failure.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
; Field 0 of the cmpxchg result is the i32 read from memory; the i1 success
; flag in field 1 is not used. Storing the value keeps the returning form of
; the atomic alive in the generated code.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Returning flat cmpxchg at syncscope("singlethread") with acq_rel success
; ordering and monotonic failure ordering.
;
; The kernel runs the cmpxchg on the i32 located four elements (16 bytes) past
; %out, comparing against %old and swapping in %in, and then stores the value
; the cmpxchg read from memory to %out itself.
;
; The autogenerated assertions below pin the -O0 output of every llc
; configuration listed in the RUN lines at the top of the file. Each expected
; sequence consists only of the kernel-argument scalar loads, register moves
; (plus a scalar add of 16 on the targets that do not fold the offset), the
; flat_atomic_cmpswap, one wait, the flat store and s_endpgm; no cache
; invalidate or write-back instruction is expected anywhere in the sequence.
; Do not edit the assertion lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
;
; NOTE(review): the s_wait_alu 0xfffe expected right after the first s_mov_b64
; in the two gfx1200 runs is presumably the GFX12 SGPR hazard mitigation rather
; than anything required by the atomic itself - confirm.
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of the i32 at index 4 from %out. This is the 16-byte displacement
; seen in the expected code, either folded into the atomic as offset:16 or
; added to the base pointer with s_add_u32/s_addc_u32.
%gep = getelementptr i32, ptr %out, i32 4
; The operation under test: volatile, single-thread scope, acq_rel on success
; and monotonic on failure.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
; Field 0 of the cmpxchg result is the i32 read from memory; the i1 success
; flag in field 1 is not used. Storing the value keeps the returning form of
; the atomic alive in the generated code.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Kernel under test: a volatile i32 cmpxchg at syncscope("singlethread") with
; success ordering `monotonic` and failure ordering `acquire`, whose loaded
; value is consumed afterwards (the "ret" form).
;   %out - base pointer; the cmpxchg targets the i32 at index 4 (byte offset
;          16) and the loaded value is stored back through %out itself.
;   %in  - replacement value supplied to the cmpxchg.
;   %old - value the memory contents are compared against.
; The per-target assembly expectations that sit between the `define` line and
; the parameter list are autogenerated (see the header of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of the i32 at index 4 from %out (byte offset 16).
%gep = getelementptr i32, ptr %out, i32 4
; The atomic under test; yields a { loaded value, success flag } pair.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
; Keep only the loaded value (field 0); the success flag is not used.
%val0 = extractvalue { i32, i1 } %val, 0
; Consume the loaded value so the atomic's result register is live; the
; expected instructions above write it to v2 and store v2 through %out.
store i32 %val0, ptr %out, align 4
ret void
}
; Kernel under test: a volatile i32 cmpxchg at syncscope("singlethread") with
; success ordering `acquire` and failure ordering `acquire`, whose loaded
; value is consumed afterwards (the "ret" form).
;   %out - base pointer; the cmpxchg targets the i32 at index 4 (byte offset
;          16) and the loaded value is stored back through %out itself.
;   %in  - replacement value supplied to the cmpxchg.
;   %old - value the memory contents are compared against.
; The per-target assembly expectations that sit between the `define` line and
; the parameter list are autogenerated (see the header of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of the i32 at index 4 from %out (byte offset 16).
%gep = getelementptr i32, ptr %out, i32 4
; The atomic under test; yields a { loaded value, success flag } pair.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
; Keep only the loaded value (field 0); the success flag is not used.
%val0 = extractvalue { i32, i1 } %val, 0
; Consume the loaded value so the atomic's result register is live; the
; expected instructions above write it to v2 and store v2 through %out.
store i32 %val0, ptr %out, align 4
ret void
}
define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Kernel exercising a value-returning cmpxchg through a generic (flat) pointer
; at syncscope("singlethread") with acq_rel success ordering and acquire
; failure ordering. The atomic targets the i32 located 16 bytes past %out and
; the value it returns is stored back to %out. The per-target assertions below
; are autogenerated (see the NOTE at the top of the file); regenerate them with
; the update script instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of the i32 at index 4 from %out, i.e. a 16-byte displacement. The
; expected assembly shows it either folded into the instruction offset or
; materialized with a 64-bit scalar add, depending on the target.
%gep = getelementptr i32, ptr %out, i32 4
; Volatile compare-and-swap: compares the memory value with %old and writes
; %in when they are equal. Success ordering is acq_rel, failure ordering is
; acquire, both limited to the singlethread synchronization scope.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
; Element 0 of the { i32, i1 } result is the value that was read from memory;
; the i1 success flag (element 1) is not used by this test.
%val0 = extractvalue { i32, i1 } %val, 0
; Storing the returned value keeps the atomic's result live, so the returning
; form of the instruction is what the assertions above match.
store i32 %val0, ptr %out, align 4
ret void
}
; Kernel exercising a value-returning cmpxchg through a generic (flat) pointer
; at syncscope("singlethread") with seq_cst success ordering and acquire
; failure ordering. The atomic targets the i32 located 16 bytes past %out and
; the value it returns is stored back to %out. The per-target assertions below
; are autogenerated (see the NOTE at the top of the file); regenerate them with
; the update script instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of the i32 at index 4 from %out, i.e. a 16-byte displacement. The
; expected assembly shows it either folded into the instruction offset or
; materialized with a 64-bit scalar add, depending on the target.
%gep = getelementptr i32, ptr %out, i32 4
; Volatile compare-and-swap: compares the memory value with %old and writes
; %in when they are equal. Success ordering is seq_cst, failure ordering is
; acquire, both limited to the singlethread synchronization scope.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
; Element 0 of the { i32, i1 } result is the value that was read from memory;
; the i1 success flag (element 1) is not used by this test.
%val0 = extractvalue { i32, i1 } %val, 0
; Storing the returned value keeps the atomic's result live, so the returning
; form of the instruction is what the assertions above match.
store i32 %val0, ptr %out, align 4
ret void
}
; Compare-and-swap through a default-address-space pointer (selected as
; flat_atomic_cmpswap in the assertions below) whose loaded value is used.
; The operation uses syncscope("singlethread") with success ordering
; monotonic and failure ordering seq_cst.
;
; The autogenerated assertions embedded in the signature pin the -O0 output
; for each check prefix: every target emits the returning form of the atomic
; (glc, sc0 or th:TH_ATOMIC_RETURN) and a wait before the store that reads v2.
; Regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; The atomic targets i32 element 4 of %out, i.e. 16 bytes past the base. The
; assertions above show this either folded into offset:16 or computed with an
; s_add_u32/s_addc_u32 pair on targets whose flat instruction takes no offset.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the expected value, %in the replacement; the first ordering
; (monotonic) applies on success and the second (seq_cst) on failure.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
; Only the loaded value (field 0) is consumed; the i1 success flag is unused.
%val0 = extractvalue { i32, i1 } %val, 0
; Writing the loaded value to %out keeps the result live, which is presumably
; what selects the returning form of the atomic in the assertions above.
store i32 %val0, ptr %out, align 4
ret void
}
; Compare-and-swap through a default-address-space pointer (selected as
; flat_atomic_cmpswap in the assertions below) whose loaded value is used.
; The operation uses syncscope("singlethread") with success ordering acquire
; and failure ordering seq_cst.
;
; The autogenerated assertions embedded in the signature pin the -O0 output
; for each check prefix: every target emits the returning form of the atomic
; (glc, sc0 or th:TH_ATOMIC_RETURN) and a wait before the store that reads v2.
; Regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; The atomic targets i32 element 4 of %out, i.e. 16 bytes past the base. The
; assertions above show this either folded into offset:16 or computed with an
; s_add_u32/s_addc_u32 pair on targets whose flat instruction takes no offset.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the expected value, %in the replacement; the first ordering
; (acquire) applies on success and the second (seq_cst) on failure.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
; Only the loaded value (field 0) is consumed; the i1 success flag is unused.
%val0 = extractvalue { i32, i1 } %val, 0
; Writing the loaded value to %out keeps the result live, which is presumably
; what selects the returning form of the atomic in the assertions above.
store i32 %val0, ptr %out, align 4
ret void
}
define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Kernel test: volatile cmpxchg on a flat pointer whose result is used, at
; syncscope("singlethread") with success ordering acq_rel and failure ordering
; seq_cst.
;
; The kernel compares the i32 located four elements (16 bytes) past %out with
; %old, swaps in %in on a match, and stores the value the cmpxchg loaded back
; to %out.
;
; The per-target assertion lines inside the signature are autogenerated (see
; the note at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of the i32 four elements (16 bytes) past %out.
%gep = getelementptr i32, ptr %out, i32 4
; Compare the value at %gep with %old and, on a match, write %in.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
; Element 0 of the cmpxchg result is the value that was loaded from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Store the loaded value so the atomic's result is used by the kernel.
store i32 %val0, ptr %out, align 4
ret void
}
; Kernel test: volatile cmpxchg on a flat pointer whose result is used, at
; syncscope("singlethread") with seq_cst as both the success and the failure
; ordering.
;
; The kernel compares the i32 located four elements (16 bytes) past %out with
; %old, swaps in %in on a match, and stores the value the cmpxchg loaded back
; to %out.
;
; The per-target assertion lines inside the signature are autogenerated (see
; the note at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of the i32 four elements (16 bytes) past %out.
%gep = getelementptr i32, ptr %out, i32 4
; Compare the value at %gep with %old and, on a match, write %in.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
; Element 0 of the cmpxchg result is the value that was loaded from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Store the loaded value so the atomic's result is used by the kernel.
store i32 %val0, ptr %out, align 4
ret void
}
; Test kernel: performs an atomic i32 load from %in with "unordered" ordering
; at syncscope("singlethread-one-as"), then writes the loaded value to %out
; with an ordinary (non-atomic) store.
;
; The assertion comments embedded in the parameter list below are
; autogenerated (see the NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand. For each
; checked configuration they expect the two kernel arguments to be read with
; scalar loads, followed by one flat load, a wait, and one flat store.
define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
; Operation under test: the ordering and syncscope on this atomic load.
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") unordered, align 4
store i32 %val, ptr %out
ret void
}
; Test kernel: performs an atomic i32 load from %in with "monotonic" ordering
; at syncscope("singlethread-one-as"), then writes the loaded value to %out
; with an ordinary (non-atomic) store.
;
; The assertion comments embedded in the parameter list below are
; autogenerated (see the NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand. For each
; checked configuration they expect the two kernel arguments to be read with
; scalar loads, followed by one flat load, a wait, and one flat store.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
; Operation under test: the ordering and syncscope on this atomic load.
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") monotonic, align 4
store i32 %val, ptr %out
ret void
}
; Test kernel: performs an atomic i32 load from %in with "acquire" ordering
; at syncscope("singlethread-one-as"), then writes the loaded value to %out
; with an ordinary (non-atomic) store.
;
; The assertion comments embedded in the parameter list below are
; autogenerated (see the NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand. For each
; checked configuration they expect the two kernel arguments to be read with
; scalar loads, followed by one flat load, a wait, and one flat store.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
; Operation under test: the ordering and syncscope on this atomic load.
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") acquire, align 4
store i32 %val, ptr %out
ret void
}
; Test kernel: performs an atomic i32 load from %in with "seq_cst" ordering
; at syncscope("singlethread-one-as"), then writes the loaded value to %out
; with an ordinary (non-atomic) store.
;
; The assertion comments embedded in the parameter list below are
; autogenerated (see the NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand. For each
; checked configuration they expect the two kernel arguments to be read with
; scalar loads, followed by one flat load, a wait, and one flat store.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %in, ptr %out) {
entry:
; Operation under test: the ordering and syncscope on this atomic load.
%val = load atomic i32, ptr %in syncscope("singlethread-one-as") seq_cst, align 4
store i32 %val, ptr %out
ret void
}
define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4
ret void
}
; Atomic i32 store through a flat pointer with syncscope("singlethread-one-as")
; and monotonic ordering. For every RUN configuration the assertions below
; expect two scalar loads (the i32 value and the 64-bit pointer), copies of
; those values into VGPRs, one flat store, and s_endpgm. No assertion expects
; a wait or cache-control instruction after the store. The gfx940 runs expect
; the store to carry the sc0 sc1 modifiers.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4
ret void
}
; Atomic i32 store through a flat pointer with syncscope("singlethread-one-as")
; and release ordering. For every RUN configuration the assertions below
; expect two scalar loads (the i32 value and the 64-bit pointer), copies of
; those values into VGPRs, one flat store, and s_endpgm. Between the VGPR
; copies and the store no assertion expects a wait or cache write-back
; instruction. The gfx940 runs expect the store to carry the sc0 sc1
; modifiers.
define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX7-LABEL: flat_singlethread_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_release_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_release_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4
ret void
}
; Atomic i32 store through a flat pointer with syncscope("singlethread-one-as")
; and seq_cst ordering. For every RUN configuration the assertions below
; expect two scalar loads (the i32 value and the 64-bit pointer), copies of
; those values into VGPRs, one flat store, and s_endpgm. No assertion expects
; an additional wait or cache-control instruction around the store. The
; gfx940 runs expect the store to carry the sc0 sc1 modifiers.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr %out) {
entry:
store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4
ret void
}
; Volatile atomicrmw xchg on a flat pointer with
; syncscope("singlethread-one-as") and monotonic ordering; the result %val is
; not used. For every RUN configuration the assertions below expect an
; s_mov_b64 copying the incoming SGPR pair that the scalar loads then use as
; their base, two scalar loads (the 64-bit pointer and the i32 operand),
; copies into VGPRs, one flat atomic swap with no modifiers, and s_endpgm.
; The gfx1200 runs also expect s_wait_alu 0xfffe directly after the s_mov_b64.
; NOTE(review): that wait is presumably the GFX12 SGPR hazard mitigation;
; confirm against the hazard recognizer.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic
ret void
}
; Volatile atomicrmw xchg on a flat pointer with
; syncscope("singlethread-one-as") and acquire ordering; the result %val is
; not used. For every RUN configuration the assertions below expect an
; s_mov_b64 copying the incoming SGPR pair that the scalar loads then use as
; their base, two scalar loads (the 64-bit pointer and the i32 operand),
; copies into VGPRs, one flat atomic swap with no modifiers, and s_endpgm
; immediately after it. No assertion expects a wait or cache-invalidate
; instruction after the swap. The gfx1200 runs also expect s_wait_alu 0xfffe
; directly after the s_mov_b64.
; NOTE(review): that wait is presumably the GFX12 SGPR hazard mitigation;
; confirm against the hazard recognizer.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
ret void
}
; Kernel under test - a volatile atomicrmw xchg on the flat pointer %out using
; syncscope("singlethread-one-as") with release ordering. The result is unused.
; For every RUN configuration the checks below expect only the scalar
; kernel-argument loads, the copies into VGPRs and one flat_atomic_swap that
; has no destination register. No cache-control instructions are expected.
; Apart from the wait that follows the argument loads (and the s_wait_alu in
; the two gfx1200 runs) no wait instructions are expected either.
define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
; %val is not read afterwards, so the checks above expect the swap without a
; destination register.
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release
ret void
}
; Kernel under test - a volatile atomicrmw xchg on the flat pointer %out using
; syncscope("singlethread-one-as") with acq_rel ordering. The result is unused.
; For every RUN configuration the checks below expect only the scalar
; kernel-argument loads, the copies into VGPRs and one flat_atomic_swap that
; has no destination register. No cache-control instructions are expected, and
; nothing is expected between the swap and s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
; %val is not read afterwards, so the checks above expect the swap without a
; destination register.
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
ret void
}
; Kernel under test - a volatile atomicrmw xchg on the flat pointer %out using
; syncscope("singlethread-one-as") with seq_cst ordering. The result is unused.
; For every RUN configuration the checks below expect only the scalar
; kernel-argument loads, the copies into VGPRs and one flat_atomic_swap that
; has no destination register. No cache-control instructions are expected, and
; nothing is expected between the swap and s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
; %val is not read afterwards, so the checks above expect the swap without a
; destination register.
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
ret void
}
; Kernel under test - a volatile atomicrmw xchg on the flat pointer %out using
; syncscope("singlethread-one-as") with acquire ordering, whose returned value
; is then stored back to %out.
; Because the result is used, the checks below expect the swap to write a
; destination register (v2) and to carry glc, sc0 or th:TH_ATOMIC_RETURN
; depending on the target. The address is copied into v[0:1] again, and a wait
; is expected immediately before the flat store that consumes v2. That wait is
; vmcnt(0) only in the tgsplit runs, vmcnt(0) lgkmcnt(0) in the other
; pre-gfx1200 runs, and s_wait_loadcnt_dscnt 0x0 in the gfx1200 runs.
; No cache-control instructions are expected in any configuration.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire
; Storing %val keeps the atomic's result live, which is why the checks above
; expect the returning form of the swap followed by a wait before the store.
store i32 %val, ptr %out, align 4
ret void
}
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel
store i32 %val, ptr %out, align 4
ret void
}
; Returning atomic exchange with seq_cst ordering in the "singlethread-one-as"
; sync scope. The kernel swaps %in into the flat pointer %out and then stores
; the value returned by the atomicrmw back through %out.
; In every expected sequence below the swap is the returning form (glc, sc0 or
; th:TH_ATOMIC_RETURN, depending on the target), a wait-count instruction sits
; between the swap and the store, and no cache invalidate or write-back
; instruction appears.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_nop 0
; GFX10-WGP-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_nop 0
; GFX10-CU-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[6:7], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_nop 0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_nop 0
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[2:3], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[2:3], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in) {
entry:
; Exchange %in with the i32 at %out; %val receives the previous contents.
%val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst
; Using %val here is what makes the atomic's return value live.
store i32 %val, ptr %out, align 4
ret void
}
; Compare-and-swap with monotonic success ordering and monotonic failure
; ordering in the "singlethread-one-as" sync scope, on the address %out plus
; four i32 elements.
; The cmpxchg result is never used. Every expected sequence below uses
; flat_atomic_cmpswap without a destination register and is followed directly
; by s_endpgm, with no wait-count or cache-maintenance instruction after it.
; Some expected sequences fold the 16-byte displacement into the instruction
; (offset:16); the others add 16 to the base with s_add_u32 / s_addc_u32.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Index 4 in units of i32, i.e. 16 bytes past %out (the 16 seen in the checks).
%gep = getelementptr i32, ptr %out, i32 4
; Compare the i32 at %gep with %old and write %in on a match; %val is unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
ret void
}
; Compare-and-swap with acquire success ordering and monotonic failure ordering
; in the "singlethread-one-as" sync scope, on the address %out plus four i32
; elements.
; The cmpxchg result is never used. Every expected sequence below uses
; flat_atomic_cmpswap without a destination register and is followed directly
; by s_endpgm: despite the acquire ordering, no wait-count or cache-invalidate
; instruction is expected after the atomic.
; Some expected sequences fold the 16-byte displacement into the instruction
; (offset:16); the others add 16 to the base with s_add_u32 / s_addc_u32.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Index 4 in units of i32, i.e. 16 bytes past %out (the 16 seen in the checks).
%gep = getelementptr i32, ptr %out, i32 4
; Compare the i32 at %gep with %old and write %in on a match; %val is unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
ret void
}
; Kernel under test: a volatile i32 cmpxchg through a flat pointer located
; 16 bytes past %out, using syncscope("singlethread-one-as") with success
; ordering release and failure ordering monotonic.
; For each target configuration the autogenerated assertions below expect
; scalar loads, operand/address setup, a single flat_atomic_cmpswap and then
; s_endpgm immediately after the atomic.
; The assertions are autogenerated (see the NOTE at the top of this file).
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Index 4 over i32 elements, i.e. 16 bytes past %out.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the compare operand and %in the replacement; %val is left unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
ret void
}
; Kernel under test: a volatile i32 cmpxchg through a flat pointer located
; 16 bytes past %out, using syncscope("singlethread-one-as") with success
; ordering acq_rel and failure ordering monotonic.
; For each target configuration the autogenerated assertions below expect
; scalar loads, operand/address setup, a single flat_atomic_cmpswap and then
; s_endpgm immediately after the atomic.
; The assertions are autogenerated (see the NOTE at the top of this file).
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Index 4 over i32 elements, i.e. 16 bytes past %out.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the compare operand and %in the replacement; %val is left unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
ret void
}
; Kernel under test: a volatile i32 cmpxchg through a flat pointer located
; 16 bytes past %out, using syncscope("singlethread-one-as") with success
; ordering seq_cst and failure ordering monotonic.
; For each target configuration the autogenerated assertions below expect
; scalar loads, operand/address setup, a single flat_atomic_cmpswap and then
; s_endpgm immediately after the atomic.
; The assertions are autogenerated (see the NOTE at the top of this file).
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Index 4 over i32 elements, i.e. 16 bytes past %out.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the compare operand and %in the replacement; %val is left unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
ret void
}
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
ret void
}
; Lowering test: volatile i32 cmpxchg through a flat pointer at
; syncscope("singlethread-one-as"), success ordering acquire, failure ordering
; acquire. For every check prefix the assertions below expect one
; flat_atomic_cmpswap (flat_atomic_cmpswap_b32 on gfx11/gfx12) followed
; directly by s_endpgm, with no cache invalidate/write-back instruction. The
; only waits expected are the one after the scalar loads and, on the gfx12
; prefixes, the s_wait_alu after the initial s_mov_b64.
; These assertion lines are generated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; i32 element 4 is byte offset 16: the assertions show it either as the
; constant 16 added to the pointer or as offset:16 on the atomic.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the compare value and %in the new value; %val is never used.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
ret void
}
; Lowering test: volatile i32 cmpxchg through a flat pointer at
; syncscope("singlethread-one-as"), success ordering release, failure ordering
; acquire. For every check prefix the assertions below expect one
; flat_atomic_cmpswap (flat_atomic_cmpswap_b32 on gfx11/gfx12) followed
; directly by s_endpgm, with no cache invalidate/write-back instruction. The
; only waits expected are the one after the scalar loads and, on the gfx12
; prefixes, the s_wait_alu after the initial s_mov_b64.
; These assertion lines are generated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; i32 element 4 is byte offset 16: the assertions show it either as the
; constant 16 added to the pointer or as offset:16 on the atomic.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the compare value and %in the new value; %val is never used.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
ret void
}
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
ret void
}
; Kernel under test: a volatile i32 cmpxchg through a flat pointer at
; syncscope "singlethread-one-as", success ordering seq_cst and failure
; ordering acquire. The cmpxchg result is never used.
;   %out - base pointer; the access targets i32 element 4 (byte offset 16)
;   %in  - replacement value
;   %old - expected (compare) value
; The autogenerated assertions below expect, for every prefix, one
; flat_atomic_cmpswap (flat_atomic_cmpswap_b32 on the GFX11 and GFX12
; prefixes) immediately followed by s_endpgm.
; The s_wait_alu 0xfffe expected on the GFX12 prefixes is presumably the
; VALU-read-SGPR hazard mitigation; confirm against the hazard recognizer.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; i32 element index 4 is a 16-byte displacement from %out; the assertions
; show it either as offset:16 on the atomic or as a scalar add of 16.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the compare operand and %in the replacement; %val is never read.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
ret void
}
; Kernel under test: a volatile i32 cmpxchg through a flat pointer at
; syncscope "singlethread-one-as", success ordering monotonic and failure
; ordering seq_cst. The cmpxchg result is never used.
;   %out - base pointer; the access targets i32 element 4 (byte offset 16)
;   %in  - replacement value
;   %old - expected (compare) value
; The autogenerated assertions below expect, for every prefix, one
; flat_atomic_cmpswap (flat_atomic_cmpswap_b32 on the GFX11 and GFX12
; prefixes) immediately followed by s_endpgm.
; The s_wait_alu 0xfffe expected on the GFX12 prefixes is presumably the
; VALU-read-SGPR hazard mitigation; confirm against the hazard recognizer.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; i32 element index 4 is a 16-byte displacement from %out; the assertions
; show it either as offset:16 on the atomic or as a scalar add of 16.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the compare operand and %in the replacement; %val is never read.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
ret void
}
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
ret void
}
; Kernel exercising a volatile i32 cmpxchg through a plain `ptr` argument with
; syncscope("singlethread-one-as"), success ordering release and failure
; ordering seq_cst. For every RUN configuration the autogenerated checks below
; expect the operation to lower to one flat compare-and-swap instruction that
; is followed directly by s_endpgm. Do not hand-edit the check lines;
; regenerate them with utils/update_llc_test_checks.py.
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address element 4 of the i32 array at %out. The checks show this as a 16-byte
; displacement: an explicit 64-bit scalar add of 16 on some runs, an immediate
; offset of 16 on the atomic instruction on the others.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the value compared against memory and %in is the replacement value.
; The result pair %val is intentionally unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
ret void
}
; Kernel exercising a volatile i32 cmpxchg through a plain `ptr` argument with
; syncscope("singlethread-one-as"), success ordering acq_rel and failure
; ordering seq_cst. For every RUN configuration the autogenerated checks below
; expect the operation to lower to one flat compare-and-swap instruction that
; is followed directly by s_endpgm. Do not hand-edit the check lines;
; regenerate them with utils/update_llc_test_checks.py.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address element 4 of the i32 array at %out. The checks show this as a 16-byte
; displacement: an explicit 64-bit scalar add of 16 on some runs, an immediate
; offset of 16 on the atomic instruction on the others.
%gep = getelementptr i32, ptr %out, i32 4
; %old is the value compared against memory and %in is the replacement value.
; The result pair %val is intentionally unused.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
ret void
}
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s4, s8
; GFX10-WGP-NEXT: s_mov_b32 s5, s9
; GFX10-WGP-NEXT: s_mov_b32 s9, s10
; GFX10-WGP-NEXT: s_mov_b32 s8, s11
; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-WGP-NEXT: s_mov_b32 s5, s8
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[6:7]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s4, s8
; GFX10-CU-NEXT: s_mov_b32 s5, s9
; GFX10-CU-NEXT: s_mov_b32 s9, s10
; GFX10-CU-NEXT: s_mov_b32 s8, s11
; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX10-CU-NEXT: s_mov_b32 s5, s8
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[2:3]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
ret void
}
; Returning cmpxchg through a flat pointer with syncscope("singlethread-one-as")
; and monotonic success / monotonic failure orderings. The exchange targets
; element 4 of the i32 array at %out (byte offset 16); the value it loads is
; then stored back to %out itself.
; In every expected sequence below the returning atomic (glc, sc0 or
; th:TH_ATOMIC_RETURN depending on target) is followed only by the address
; setup for the store, a single wait, and the store; no cache-invalidate
; instruction appears.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of element 4 of the i32 array at %out, i.e. 16 bytes past %out.
%gep = getelementptr i32, ptr %out, i32 4
; Compare the word at %gep with %old and, on a match, replace it with %in.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
; Field 0 of the { i32, i1 } result is the value that was loaded from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Using the loaded value keeps the returning form of the atomic in the output.
store i32 %val0, ptr %out, align 4
ret void
}
; Returning cmpxchg through a flat pointer with syncscope("singlethread-one-as")
; and acquire success / monotonic failure orderings. The exchange targets
; element 4 of the i32 array at %out (byte offset 16); the value it loads is
; then stored back to %out itself.
; In every expected sequence below the returning atomic (glc, sc0 or
; th:TH_ATOMIC_RETURN depending on target) is followed only by the address
; setup for the store, a single wait, and the store; no cache-invalidate
; instruction appears after the atomic despite the acquire ordering.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of element 4 of the i32 array at %out, i.e. 16 bytes past %out.
%gep = getelementptr i32, ptr %out, i32 4
; Compare the word at %gep with %old and, on a match, replace it with %in.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
; Field 0 of the { i32, i1 } result is the value that was loaded from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Using the loaded value keeps the returning form of the atomic in the output.
store i32 %val0, ptr %out, align 4
ret void
}
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Returning flat cmpxchg (volatile) at syncscope "singlethread-one-as" with
; success ordering acq_rel and failure ordering monotonic. The compare-and-swap
; targets index 4 past %out (i32 elements, so byte offset 16); the value it
; reads is then stored to %out itself.
;
; The assertion lines inside the signature are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; In every checked configuration the expected sequence is: scalar loads of the
; three arguments, the returning cmpswap, a wait, the store of the result, and
; s_endpgm. Configurations that do not fold the offset into the instruction add
; 16 to the pointer with s_add_u32/s_addc_u32 first; the rest use offset:16.
; No cache invalidate or write-back instruction appears in any sequence.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of the i32 at index 4 from %out, i.e. 16 bytes past the base pointer.
%gep = getelementptr i32, ptr %out, i32 4
; Volatile compare-and-swap on that address: expected value %old, replacement
; %in, single-thread/one-address-space scope, acq_rel on success and monotonic
; on failure.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
; Field 0 of the { i32, i1 } result is the i32 value read from memory; the i1
; field is not used by this test.
%val0 = extractvalue { i32, i1 } %val, 0
; Store the read value to %out so the returned data is actually consumed.
store i32 %val0, ptr %out, align 4
ret void
}
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Returning compare-and-swap on a generic (flat) pointer at sync scope
; "singlethread-one-as", with monotonic success ordering and acquire failure
; ordering. The kernel swaps at %out + 4 i32 elements and writes the value the
; cmpxchg loaded back to %out itself.
; The assertion lines embedded in the parameter list are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; i32 element index 4 is a 16-byte displacement from %out.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old, install %in on a match; success ordering is monotonic,
; failure ordering is acquire.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
; Field 0 of the result pair is the value loaded from memory; the i1 success
; flag (field 1) is not used by this test.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Returning compare-and-swap on a generic (flat) pointer at sync scope
; "singlethread-one-as", with acquire ordering on both the success and the
; failure path. The kernel swaps at %out + 4 i32 elements and writes the value
; the cmpxchg loaded back to %out itself.
; The assertion lines embedded in the parameter list are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; i32 element index 4 is a 16-byte displacement from %out.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old, install %in on a match; both the success and the
; failure ordering are acquire.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
; Field 0 of the result pair is the value loaded from memory; the i1 success
; flag (field 1) is not used by this test.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Returning cmpxchg through a generic (flat) pointer at syncscope
; "singlethread-one-as", success ordering acq_rel, failure ordering acquire.
; The kernel exchanges the i32 at index 4 from %out (byte offset 16) and then
; stores the value that was read back to %out itself.
; In every expected sequence below the exchange is a single returning
; flat_atomic_cmpswap followed by a wait and a flat store. None of the
; sequences contains a separate cache invalidate or write-back instruction.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of the i32 at index 4 from %out, i.e. 16 bytes past %out.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and swap in %in. Orderings are acq_rel on success and
; acquire on failure.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
; Only field 0 (the i32) of the result pair is used. The i1 field is never
; extracted.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Returning cmpxchg through a generic (flat) pointer at syncscope
; "singlethread-one-as", success ordering seq_cst, failure ordering acquire.
; The kernel exchanges the i32 at index 4 from %out (byte offset 16) and then
; stores the value that was read back to %out itself.
; In every expected sequence below the exchange is a single returning
; flat_atomic_cmpswap followed by a wait and a flat store. None of the
; sequences contains a separate cache invalidate or write-back instruction.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of the i32 at index 4 from %out, i.e. 16 bytes past %out.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and swap in %in. Orderings are seq_cst on success and
; acquire on failure.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
; Only field 0 (the i32) of the result pair is used. The i1 field is never
; extracted.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Test kernel: a volatile, value-returning cmpxchg through a flat pointer at
; sync scope "singlethread-one-as", with success ordering monotonic and
; failure ordering seq_cst. The value observed by the cmpxchg is stored back
; to %out.
;
; Arguments: %out is the base pointer, %in is the replacement value and %old
; is the expected (compare) value.
;
; The prefixed comment lines inside the definition are autogenerated llc -O0
; expectations, one group per run line at the top of the file. Regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.
; In every expected sequence below the atomic is a single returning flat
; compare-swap, followed by a wait and the store of its result; none of the
; sequences contains a cache invalidate or write-back instruction.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 of %out, i.e. 16 bytes past %out.
%gep = getelementptr i32, ptr %out, i32 4
; Volatile compare-exchange at %gep: expected value %old, replacement %in.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
; Field 0 of the { i32, i1 } result is the value that was read from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Publish that value at the start of %out so the returning form is exercised.
store i32 %val0, ptr %out, align 4
ret void
}
; Test kernel: a volatile, value-returning cmpxchg through a flat pointer at
; sync scope "singlethread-one-as", with success ordering acquire and failure
; ordering seq_cst. The value observed by the cmpxchg is stored back to %out.
;
; Arguments: %out is the base pointer, %in is the replacement value and %old
; is the expected (compare) value.
;
; The prefixed comment lines inside the definition are autogenerated llc -O0
; expectations, one group per run line at the top of the file. Regenerate them
; with utils/update_llc_test_checks.py instead of editing them by hand.
; In every expected sequence below the atomic is a single returning flat
; compare-swap, followed by a wait and the store of its result; none of the
; sequences contains a cache invalidate or write-back instruction.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 of %out, i.e. 16 bytes past %out.
%gep = getelementptr i32, ptr %out, i32 4
; Volatile compare-exchange at %gep: expected value %old, replacement %in.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
; Field 0 of the { i32, i1 } result is the value that was read from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Publish that value at the start of %out so the returning form is exercised.
store i32 %val0, ptr %out, align 4
ret void
}
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr %out, i32 4
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr %out, align 4
ret void
}
; Returning cmpxchg through a flat pointer with syncscope "singlethread-one-as",
; success ordering acq_rel and failure ordering seq_cst.
; The kernel compares the i32 located 4 elements past %out (byte offset 16 in
; the expected assembly) against %old, swaps in %in on a match, and stores the
; value the cmpxchg loaded back to %out.
; After the leading s_load instructions (presumably fetching the three kernel
; arguments), every expected sequence below contains just the returning
; flat_atomic_cmpswap, a wait, and the dependent flat_store; no cache
; invalidate or write-back instruction is expected for any target.
; The assertions embedded in the signature are autogenerated (see the NOTE at
; the top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Element 4 of an i32 array at %out, i.e. 16 bytes past %out; this is the
; offset:16 (or the explicit 64-bit add of 16) seen in the expected assembly.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and swap in %in; orderings are acq_rel on success and
; seq_cst on failure.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
; Field 0 of the { i32, i1 } result is the value that was loaded from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Using the loaded value keeps the returning form of the atomic live.
store i32 %val0, ptr %out, align 4
ret void
}
; Returning cmpxchg through a flat pointer with syncscope "singlethread-one-as",
; success ordering seq_cst and failure ordering seq_cst.
; The kernel compares the i32 located 4 elements past %out (byte offset 16 in
; the expected assembly) against %old, swaps in %in on a match, and stores the
; value the cmpxchg loaded back to %out.
; After the leading s_load instructions (presumably fetching the three kernel
; arguments), every expected sequence below contains just the returning
; flat_atomic_cmpswap, a wait, and the dependent flat_store; no cache
; invalidate or write-back instruction is expected for any target.
; The assertions embedded in the signature are autogenerated (see the NOTE at
; the top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_mov_b32 s6, s4
; GFX10-WGP-NEXT: s_mov_b32 s7, s5
; GFX10-WGP-NEXT: s_mov_b32 s11, s12
; GFX10-WGP-NEXT: s_mov_b32 s10, s13
; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-WGP-NEXT: s_mov_b32 s7, s10
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_mov_b32 s6, s4
; GFX10-CU-NEXT: s_mov_b32 s7, s5
; GFX10-CU-NEXT: s_mov_b32 s11, s12
; GFX10-CU-NEXT: s_mov_b32 s10, s13
; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX10-CU-NEXT: s_mov_b32 s7, s10
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[8:9], s[6:7]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-WGP-NEXT: s_wait_alu 0xfffe
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
; GFX12-CU-NEXT: s_endpgm
ptr %out, i32 %in, i32 %old) {
entry:
; Element 4 of an i32 array at %out, i.e. 16 bytes past %out; this is the
; offset:16 (or the explicit 64-bit add of 16) seen in the expected assembly.
%gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and swap in %in; both the success and the failure
; ordering are seq_cst.
%val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
; Field 0 of the { i32, i1 } result is the value that was loaded from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Using the loaded value keeps the returning form of the atomic live.
store i32 %val0, ptr %out, align 4
ret void
}