Using a BufferSize of one for memory ProcResources will result in better ILP since it more accurately models the dependencies between memory ops and their consumers on an in-order processor. After this change, the scheduler will treat the data edges from loads as blocking so that stalls are guaranteed when waiting for data to be retrieved from memory. Since we don't actually track waitcnt here, this should do a better job at modeling their behavior. Practically, this means that the scheduler will trigger the 'STALL' heuristic more often. This type of change needs to be evaluated experimentally. Preliminary results are positive. Fixes: SWDEV-282962 Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D114777
8255 lines
361 KiB
LLVM
8255 lines
361 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
|
|
|
|
; Test kernel: performs a syncscope("singlethread") unordered atomic i32 load
; from %in and writes the loaded value to %out with an ordinary store.
; Every llc invocation at the top of the file expects the same shape:
; flat_load_dword with no trailing modifiers, then exactly one s_waitcnt before
; the flat_store_dword. The gfx90a +tgsplit run waits on vmcnt(0) alone; the
; other runs wait on vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_singlethread_unordered_load(
|
|
; GFX7-LABEL: flat_singlethread_unordered_load:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_unordered_load:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_load:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
|
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %in, i32* %out) {
|
|
entry:
|
|
%val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
|
|
store i32 %val, i32* %out
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: performs a syncscope("singlethread") monotonic atomic i32 load
; from %in and writes the loaded value to %out with an ordinary store.
; Every llc invocation at the top of the file expects the same shape:
; flat_load_dword with no trailing modifiers, then exactly one s_waitcnt before
; the flat_store_dword. The gfx90a +tgsplit run waits on vmcnt(0) alone; the
; other runs wait on vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_singlethread_monotonic_load(
|
|
; GFX7-LABEL: flat_singlethread_monotonic_load:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_load:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
|
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %in, i32* %out) {
|
|
entry:
|
|
%val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
|
|
store i32 %val, i32* %out
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: performs a syncscope("singlethread") acquire atomic i32 load
; from %in and writes the loaded value to %out with an ordinary store.
; The checked sequence is instruction-for-instruction the same as the one for
; flat_singlethread_unordered_load: flat_load_dword with no trailing modifiers,
; then exactly one s_waitcnt before the flat_store_dword. The gfx90a +tgsplit
; run waits on vmcnt(0) alone; the other runs wait on vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_singlethread_acquire_load(
|
|
; GFX7-LABEL: flat_singlethread_acquire_load:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acquire_load:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_load:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
|
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %in, i32* %out) {
|
|
entry:
|
|
%val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
|
|
store i32 %val, i32* %out
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: performs a syncscope("singlethread") seq_cst atomic i32 load
; from %in and writes the loaded value to %out with an ordinary store.
; The checked sequence is instruction-for-instruction the same as the one for
; flat_singlethread_unordered_load: flat_load_dword with no trailing modifiers,
; then exactly one s_waitcnt before the flat_store_dword. The gfx90a +tgsplit
; run waits on vmcnt(0) alone; the other runs wait on vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_singlethread_seq_cst_load(
|
|
; GFX7-LABEL: flat_singlethread_seq_cst_load:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_load:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
|
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %in, i32* %out) {
|
|
entry:
|
|
%val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
|
|
store i32 %val, i32* %out
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: stores the i32 argument %in to %out with a
; syncscope("singlethread") unordered atomic store.
; Expected output in every run: two scalar loads (preceded by s_clause 0x1 on
; the gfx1010 runs), s_waitcnt lgkmcnt(0), register moves, then one
; flat_store_dword immediately followed by s_endpgm. The gfx90a runs set up the
; address with a single v_pk_mov_b32 instead of two v_mov_b32_e32.
define amdgpu_kernel void @flat_singlethread_unordered_store(
|
|
; GFX7-LABEL: flat_singlethread_unordered_store:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_unordered_store:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_unordered_store:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32 %in, i32* %out) {
|
|
entry:
|
|
store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: stores the i32 argument %in to %out with a
; syncscope("singlethread") monotonic atomic store.
; Expected output in every run: two scalar loads (preceded by s_clause 0x1 on
; the gfx1010 runs), s_waitcnt lgkmcnt(0), register moves, then one
; flat_store_dword immediately followed by s_endpgm. The gfx90a runs set up the
; address with a single v_pk_mov_b32 instead of two v_mov_b32_e32.
define amdgpu_kernel void @flat_singlethread_monotonic_store(
|
|
; GFX7-LABEL: flat_singlethread_monotonic_store:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_store:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32 %in, i32* %out) {
|
|
entry:
|
|
store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: stores the i32 argument %in to %out with a
; syncscope("singlethread") release atomic store.
; The checked sequence is instruction-for-instruction the same as the one for
; flat_singlethread_unordered_store: two scalar loads (preceded by s_clause 0x1
; on the gfx1010 runs), s_waitcnt lgkmcnt(0), register moves, then one
; flat_store_dword immediately followed by s_endpgm. No additional s_waitcnt is
; expected ahead of the store.
define amdgpu_kernel void @flat_singlethread_release_store(
|
|
; GFX7-LABEL: flat_singlethread_release_store:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_release_store:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_release_store:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_store:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32 %in, i32* %out) {
|
|
entry:
|
|
store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: stores the i32 argument %in to %out with a
; syncscope("singlethread") seq_cst atomic store.
; The checked sequence is instruction-for-instruction the same as the one for
; flat_singlethread_unordered_store: two scalar loads (preceded by s_clause 0x1
; on the gfx1010 runs), s_waitcnt lgkmcnt(0), register moves, then one
; flat_store_dword immediately followed by s_endpgm. No additional s_waitcnt is
; expected ahead of the store.
define amdgpu_kernel void @flat_singlethread_seq_cst_store(
|
|
; GFX7-LABEL: flat_singlethread_seq_cst_store:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_store:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32 %in, i32* %out) {
|
|
entry:
|
|
store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: exchanges the i32 at %out with %in using a volatile
; syncscope("singlethread") monotonic atomicrmw xchg; the returned %val is
; never used.
; Expected output in every run: two scalar loads (preceded by s_clause 0x1 on
; the gfx1010 runs), s_waitcnt lgkmcnt(0), register moves, then one
; flat_atomic_swap with no trailing modifiers immediately followed by s_endpgm.
; The gfx90a runs set up the address with a single v_pk_mov_b32 instead of two
; v_mov_b32_e32.
define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic
|
|
ret void
|
|
}
|
|
|
|
; Exercises `atomicrmw volatile xchg` on a default-address-space i32 pointer
; with syncscope("singlethread") and acquire ordering; the returned value is
; not used. For every run configuration the expected code is a flat_atomic_swap
; (no glc modifier) that is immediately followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
|
|
ret void
|
|
}
|
|
|
|
; Exercises `atomicrmw volatile xchg` on a default-address-space i32 pointer
; with syncscope("singlethread") and release ordering; the returned value is
; not used. For every run configuration the expected code is a flat_atomic_swap
; (no glc modifier) that is immediately followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_release_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release
|
|
ret void
|
|
}
|
|
|
|
; Exercises `atomicrmw volatile xchg` on a default-address-space i32 pointer
; with syncscope("singlethread") and acq_rel ordering; the returned value is
; not used. For every run configuration the expected code is a flat_atomic_swap
; (no glc modifier) that is immediately followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
|
|
ret void
|
|
}
|
|
|
|
; Exercises `atomicrmw volatile xchg` on a default-address-space i32 pointer
; with syncscope("singlethread") and seq_cst ordering; the returned value is
; not used. For every run configuration the expected code is a flat_atomic_swap
; (no glc modifier) that is immediately followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Exercises the value-returning form of `atomicrmw volatile xchg` with
; syncscope("singlethread") and acquire ordering: the old value is stored back
; to %out. Expected code is a flat_atomic_swap with the glc modifier, then an
; s_waitcnt, then a flat_store_dword of the returned register. The +tgsplit
; run expects `s_waitcnt vmcnt(0)` alone; every other run expects
; `s_waitcnt vmcnt(0) lgkmcnt(0)`.
define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_ret_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
|
|
store i32 %val, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Exercises the value-returning form of `atomicrmw volatile xchg` with
; syncscope("singlethread") and acq_rel ordering: the old value is stored back
; to %out. Expected code is a flat_atomic_swap with the glc modifier, then an
; s_waitcnt, then a flat_store_dword of the returned register. The +tgsplit
; run expects `s_waitcnt vmcnt(0)` alone; every other run expects
; `s_waitcnt vmcnt(0) lgkmcnt(0)`.
define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
|
|
store i32 %val, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Exercises the value-returning form of `atomicrmw volatile xchg` with
; syncscope("singlethread") and seq_cst ordering: the old value is stored back
; to %out. Expected code is a flat_atomic_swap with the glc modifier, then an
; s_waitcnt, then a flat_store_dword of the returned register. The +tgsplit
; run expects `s_waitcnt vmcnt(0)` alone; every other run expects
; `s_waitcnt vmcnt(0) lgkmcnt(0)`.
define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
|
|
store i32 %val, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Exercises `cmpxchg volatile` with syncscope("singlethread") and
; monotonic/monotonic (success/failure) orderings on element 4 of %out, i.e. a
; 16-byte offset for i32; the result is not used. Expected code is a
; flat_atomic_cmpswap (no glc modifier) immediately followed by s_endpgm. The
; two gfx90a runs expect the offset folded into the instruction as `offset:16`;
; the remaining runs expect it added to the base address with
; s_add_u32/s_addc_u32 before the atomic.
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
|
|
ret void
|
|
}
|
|
|
|
; Exercises `cmpxchg volatile` with syncscope("singlethread") and
; acquire/monotonic (success/failure) orderings on element 4 of %out, i.e. a
; 16-byte offset for i32; the result is not used. Expected code is a
; flat_atomic_cmpswap (no glc modifier) immediately followed by s_endpgm. The
; two gfx90a runs expect the offset folded into the instruction as `offset:16`;
; the remaining runs expect it added to the base address with
; s_add_u32/s_addc_u32 before the atomic.
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
|
|
ret void
|
|
}
|
|
|
|
; Kernel test: volatile cmpxchg through a flat pointer at %out[4] (i32 element
; index 4, i.e. the byte offset 16 seen in the expected code) using
; syncscope("singlethread"), success ordering `release` and failure ordering
; `monotonic`. The cmpxchg result %val is not used.
; For every llc configuration checked below, the expected code is a plain
; flat_atomic_cmpswap immediately followed by s_endpgm; the only s_waitcnt
; expected is the lgkmcnt(0) that follows the scalar loads.
; The check lines below are tool-generated (see the note at the top of this
; file); regenerate them with the script rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
|
|
ret void
|
|
}
|
|
|
|
; Kernel test: volatile cmpxchg through a flat pointer at %out[4] (i32 element
; index 4, i.e. the byte offset 16 seen in the expected code) using
; syncscope("singlethread"), success ordering `acq_rel` and failure ordering
; `monotonic`. The cmpxchg result %val is not used.
; For every llc configuration checked below, the expected code is a plain
; flat_atomic_cmpswap immediately followed by s_endpgm; the only s_waitcnt
; expected is the lgkmcnt(0) that follows the scalar loads.
; The check lines below are tool-generated (see the note at the top of this
; file); regenerate them with the script rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
|
|
ret void
|
|
}
|
|
|
|
; Kernel test: volatile cmpxchg through a flat pointer at %out[4] (i32 element
; index 4, i.e. the byte offset 16 seen in the expected code) using
; syncscope("singlethread"), success ordering `seq_cst` and failure ordering
; `monotonic`. The cmpxchg result %val is not used.
; For every llc configuration checked below, the expected code is a plain
; flat_atomic_cmpswap immediately followed by s_endpgm; the only s_waitcnt
; expected is the lgkmcnt(0) that follows the scalar loads.
; The check lines below are tool-generated (see the note at the top of this
; file); regenerate them with the script rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
|
|
ret void
|
|
}
|
|
|
|
; Kernel test: volatile cmpxchg through a flat pointer at %out[4] (i32 element
; index 4, i.e. the byte offset 16 seen in the expected code) using
; syncscope("singlethread"), success ordering `monotonic` and failure ordering
; `acquire`. The cmpxchg result %val is not used.
; For every llc configuration checked below, the expected code is a plain
; flat_atomic_cmpswap immediately followed by s_endpgm; the only s_waitcnt
; expected is the lgkmcnt(0) that follows the scalar loads.
; The check lines below are tool-generated (see the note at the top of this
; file); regenerate them with the script rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
|
|
ret void
|
|
}
|
|
|
|
; Kernel test: volatile cmpxchg through a flat pointer at %out[4] (i32 element
; index 4, i.e. the byte offset 16 seen in the expected code) using
; syncscope("singlethread"), success ordering `acquire` and failure ordering
; `acquire`. The cmpxchg result %val is not used.
; For every llc configuration checked below, the expected code is a plain
; flat_atomic_cmpswap immediately followed by s_endpgm; the only s_waitcnt
; expected is the lgkmcnt(0) that follows the scalar loads.
; The check lines below are tool-generated (see the note at the top of this
; file); regenerate them with the script rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
|
|
ret void
|
|
}
|
|
|
|
; Kernel test: volatile cmpxchg through a flat pointer at %out[4] (i32 element
; index 4, i.e. the byte offset 16 seen in the expected code) using
; syncscope("singlethread"), success ordering `release` and failure ordering
; `acquire`. The cmpxchg result %val is not used.
; For every llc configuration checked below, the expected code is a plain
; flat_atomic_cmpswap immediately followed by s_endpgm; the only s_waitcnt
; expected is the lgkmcnt(0) that follows the scalar loads.
; The check lines below are tool-generated (see the note at the top of this
; file); regenerate them with the script rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
|
|
ret void
|
|
}
|
|
|
|
; Kernel test: volatile cmpxchg through a flat pointer at %out[4] (i32 element
; index 4, i.e. the byte offset 16 seen in the expected code) using
; syncscope("singlethread"), success ordering `acq_rel` and failure ordering
; `acquire`. The cmpxchg result %val is not used.
; For every llc configuration checked below, the expected code is a plain
; flat_atomic_cmpswap immediately followed by s_endpgm; the only s_waitcnt
; expected is the lgkmcnt(0) that follows the scalar loads.
; The check lines below are tool-generated (see the note at the top of this
; file); regenerate them with the script rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
|
|
ret void
|
|
}
|
|
|
|
; Kernel test: volatile cmpxchg through a flat pointer at %out[4] (i32 element
; index 4, i.e. the byte offset 16 seen in the expected code) using
; syncscope("singlethread"), success ordering `seq_cst` and failure ordering
; `acquire`. The cmpxchg result %val is not used.
; For every llc configuration checked below, the expected code is a plain
; flat_atomic_cmpswap immediately followed by s_endpgm; the only s_waitcnt
; expected is the lgkmcnt(0) that follows the scalar loads.
; The check lines below are tool-generated (see the note at the top of this
; file); regenerate them with the script rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
|
|
ret void
|
|
}
|
|
|
|
; cmpxchg on a generic i32* at syncscope("singlethread") with success ordering
; monotonic and failure ordering seq_cst; the cmpxchg result is not used.
; For every run line the expected code is the kernel-argument loads, one
; s_waitcnt lgkmcnt(0), the address/operand moves, and a flat_atomic_cmpswap
; followed directly by s_endpgm. The 16-byte offset from the getelementptr is
; expected as offset:16 on the gfx90a runs and as s_add_u32/s_addc_u32 on the
; other runs.
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
|
|
ret void
|
|
}
|
|
|
|
; cmpxchg on a generic i32* at syncscope("singlethread") with success ordering
; acquire and failure ordering seq_cst; the cmpxchg result is not used.
; For every run line the expected code is the kernel-argument loads, one
; s_waitcnt lgkmcnt(0), the address/operand moves, and a flat_atomic_cmpswap
; followed directly by s_endpgm. The 16-byte offset from the getelementptr is
; expected as offset:16 on the gfx90a runs and as s_add_u32/s_addc_u32 on the
; other runs.
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
|
|
ret void
|
|
}
|
|
|
|
; cmpxchg on a generic i32* at syncscope("singlethread") with success ordering
; release and failure ordering seq_cst; the cmpxchg result is not used.
; For every run line the expected code is the kernel-argument loads, one
; s_waitcnt lgkmcnt(0), the address/operand moves, and a flat_atomic_cmpswap
; followed directly by s_endpgm. The 16-byte offset from the getelementptr is
; expected as offset:16 on the gfx90a runs and as s_add_u32/s_addc_u32 on the
; other runs.
define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
|
|
ret void
|
|
}
|
|
|
|
; cmpxchg on a generic i32* at syncscope("singlethread") with success ordering
; acq_rel and failure ordering seq_cst; the cmpxchg result is not used.
; For every run line the expected code is the kernel-argument loads, one
; s_waitcnt lgkmcnt(0), the address/operand moves, and a flat_atomic_cmpswap
; followed directly by s_endpgm. The 16-byte offset from the getelementptr is
; expected as offset:16 on the gfx90a runs and as s_add_u32/s_addc_u32 on the
; other runs.
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
|
|
ret void
|
|
}
|
|
|
|
; cmpxchg on a generic i32* at syncscope("singlethread") with success ordering
; seq_cst and failure ordering seq_cst; the cmpxchg result is not used.
; For every run line the expected code is the kernel-argument loads, one
; s_waitcnt lgkmcnt(0), the address/operand moves, and a flat_atomic_cmpswap
; followed directly by s_endpgm. The 16-byte offset from the getelementptr is
; expected as offset:16 on the gfx90a runs and as s_add_u32/s_addc_u32 on the
; other runs.
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
|
|
ret void
|
|
}
|
|
|
|
; cmpxchg on a generic i32* at syncscope("singlethread") with success ordering
; monotonic and failure ordering monotonic, where field 0 of the result is
; extracted and stored to %out. For every run line the expected code is a
; flat_atomic_cmpswap with a v2 destination and the glc modifier, then an
; s_waitcnt, then a flat_store_dword of v2, then s_endpgm. The expected wait is
; "vmcnt(0) lgkmcnt(0)" on all runs except the +tgsplit run, which expects
; "vmcnt(0)" only.
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; cmpxchg on a generic i32* at syncscope("singlethread") with success ordering
; acquire and failure ordering monotonic, where field 0 of the result is
; extracted and stored to %out. For every run line the expected code is a
; flat_atomic_cmpswap with a v2 destination and the glc modifier, then an
; s_waitcnt, then a flat_store_dword of v2, then s_endpgm. The expected wait is
; "vmcnt(0) lgkmcnt(0)" on all runs except the +tgsplit run, which expects
; "vmcnt(0)" only.
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; cmpxchg on a generic i32* at syncscope("singlethread") with success ordering
; release and failure ordering monotonic, where field 0 of the result is
; extracted and stored to %out. For every run line the expected code is a
; flat_atomic_cmpswap with a v2 destination and the glc modifier, then an
; s_waitcnt, then a flat_store_dword of v2, then s_endpgm. The expected wait is
; "vmcnt(0) lgkmcnt(0)" on all runs except the +tgsplit run, which expects
; "vmcnt(0)" only.
define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Volatile cmpxchg at syncscope("singlethread") with acq_rel success ordering
; and monotonic failure ordering. The exchange targets the i32 four elements
; (16 bytes) past %out; field 0 of the cmpxchg result (the value that was
; loaded) is then stored to %out itself.
; The assertions interleaved with the signature are autogenerated (see the
; NOTE at the top of this file); regenerate them instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
|
|
|
|
; Volatile cmpxchg at syncscope("singlethread") with seq_cst success ordering
; and monotonic failure ordering. The exchange targets the i32 four elements
; (16 bytes) past %out; field 0 of the cmpxchg result (the value that was
; loaded) is then stored to %out itself.
; The assertions interleaved with the signature are autogenerated (see the
; NOTE at the top of this file); regenerate them instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
|
|
|
|
; Volatile cmpxchg at syncscope("singlethread") with monotonic success ordering
; and acquire failure ordering. The exchange targets the i32 four elements
; (16 bytes) past %out; field 0 of the cmpxchg result (the value that was
; loaded) is then stored to %out itself.
; The assertions interleaved with the signature are autogenerated (see the
; NOTE at the top of this file); regenerate them instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
|
|
|
|
; Volatile cmpxchg at syncscope("singlethread") with acquire success ordering
; and acquire failure ordering. The exchange targets the i32 four elements
; (16 bytes) past %out; field 0 of the cmpxchg result (the value that was
; loaded) is then stored to %out itself.
; The assertions interleaved with the signature are autogenerated (see the
; NOTE at the top of this file); regenerate them instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
|
|
|
|
; Volatile cmpxchg at syncscope("singlethread") with release success ordering
; and acquire failure ordering. The exchange targets the i32 four elements
; (16 bytes) past %out; field 0 of the cmpxchg result (the value that was
; loaded) is then stored to %out itself.
; The assertions interleaved with the signature are autogenerated (see the
; NOTE at the top of this file); regenerate them instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
|
|
|
|
; Volatile cmpxchg at syncscope("singlethread") with acq_rel success ordering
; and acquire failure ordering. The exchange targets the i32 four elements
; (16 bytes) past %out; field 0 of the cmpxchg result (the value that was
; loaded) is then stored to %out itself.
; The assertions interleaved with the signature are autogenerated (see the
; NOTE at the top of this file); regenerate them instead of editing by hand.
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s4, s0, 16
; GFX7-NEXT: s_addc_u32 s5, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_clause 0x1
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_clause 0x1
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
    i32* %out, i32 %in, i32 %old) {
entry:
  %gep = getelementptr i32, i32* %out, i32 4
  %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire
  %val0 = extractvalue { i32, i1 } %val, 0
  store i32 %val0, i32* %out, align 4
  ret void
}
|
|
|
|
; Value-returning cmpxchg through a flat pointer at syncscope("singlethread"),
; success ordering seq_cst, failure ordering acquire.
; The atomic targets element 4 of %out (the 16-byte displacement shows up as
; "s_add_u32 ..., 16" or "offset:16" in the expected code); %old is the compare
; value and %in the replacement. Field 0 of the result pair (the loaded value)
; is stored back to %out. Each expected sequence places an s_waitcnt between
; the flat_atomic_cmpswap and the flat_store_dword and contains no other
; instructions besides the address/operand setup.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Value-returning cmpxchg through a flat pointer at syncscope("singlethread"),
; success ordering monotonic, failure ordering seq_cst.
; The atomic targets element 4 of %out (the 16-byte displacement shows up as
; "s_add_u32 ..., 16" or "offset:16" in the expected code); %old is the compare
; value and %in the replacement. Field 0 of the result pair (the loaded value)
; is stored back to %out. Each expected sequence places an s_waitcnt between
; the flat_atomic_cmpswap and the flat_store_dword and contains no other
; instructions besides the address/operand setup.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Value-returning cmpxchg through a flat pointer at syncscope("singlethread"),
; success ordering acquire, failure ordering seq_cst.
; The atomic targets element 4 of %out (the 16-byte displacement shows up as
; "s_add_u32 ..., 16" or "offset:16" in the expected code); %old is the compare
; value and %in the replacement. Field 0 of the result pair (the loaded value)
; is stored back to %out. Each expected sequence places an s_waitcnt between
; the flat_atomic_cmpswap and the flat_store_dword and contains no other
; instructions besides the address/operand setup.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Value-returning cmpxchg through a flat pointer at syncscope("singlethread"),
; success ordering release, failure ordering seq_cst.
; The atomic targets element 4 of %out (the 16-byte displacement shows up as
; "s_add_u32 ..., 16" or "offset:16" in the expected code); %old is the compare
; value and %in the replacement. Field 0 of the result pair (the loaded value)
; is stored back to %out. Each expected sequence places an s_waitcnt between
; the flat_atomic_cmpswap and the flat_store_dword and contains no other
; instructions besides the address/operand setup.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Value-returning cmpxchg through a flat pointer at syncscope("singlethread"),
; success ordering acq_rel, failure ordering seq_cst.
; The atomic targets element 4 of %out (the 16-byte displacement shows up as
; "s_add_u32 ..., 16" or "offset:16" in the expected code); %old is the compare
; value and %in the replacement. Field 0 of the result pair (the loaded value)
; is stored back to %out. Each expected sequence places an s_waitcnt between
; the flat_atomic_cmpswap and the flat_store_dword and contains no other
; instructions besides the address/operand setup.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Value-returning cmpxchg through a flat pointer at syncscope("singlethread"),
; success ordering seq_cst, failure ordering seq_cst.
; The atomic targets element 4 of %out (the 16-byte displacement shows up as
; "s_add_u32 ..., 16" or "offset:16" in the expected code); %old is the compare
; value and %in the replacement. Field 0 of the result pair (the loaded value)
; is stored back to %out. Each expected sequence places an s_waitcnt between
; the flat_atomic_cmpswap and the flat_store_dword and contains no other
; instructions besides the address/operand setup.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Unordered atomic load through a flat pointer at
; syncscope("singlethread-one-as"): reads an i32 from %in and stores it to
; %out.
; Each expected sequence issues flat_load_dword without a glc modifier, then
; an s_waitcnt before the flat_store_dword of the loaded value; no other
; instructions appear besides the address setup.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
|
|
; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_load:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
|
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %in, i32* %out) {
|
|
entry:
|
|
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4
|
|
store i32 %val, i32* %out
|
|
ret void
|
|
}
|
|
|
|
; Atomic i32 load from the flat pointer %in with `monotonic` ordering at
; syncscope("singlethread-one-as"); the loaded value is then written to %out
; with a plain store. Every run expects flat_load_dword, one s_waitcnt, then
; flat_store_dword directly followed by s_endpgm. The run with +tgsplit expects
; `s_waitcnt vmcnt(0)` where the other runs expect `s_waitcnt vmcnt(0) lgkmcnt(0)`.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
|
|
; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_load:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
|
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %in, i32* %out) {
|
|
entry:
|
|
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4
|
|
store i32 %val, i32* %out
|
|
ret void
|
|
}
|
|
|
|
; Atomic i32 load from the flat pointer %in with `acquire` ordering at
; syncscope("singlethread-one-as"); the loaded value is then written to %out
; with a plain store. Every run expects flat_load_dword, one s_waitcnt, then
; flat_store_dword directly followed by s_endpgm, i.e. no additional
; instruction is expected between the load and the store besides the register
; moves and the wait. The run with +tgsplit expects `s_waitcnt vmcnt(0)` where
; the other runs expect `s_waitcnt vmcnt(0) lgkmcnt(0)`.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_load:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
|
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %in, i32* %out) {
|
|
entry:
|
|
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4
|
|
store i32 %val, i32* %out
|
|
ret void
|
|
}
|
|
|
|
; Atomic i32 load from the flat pointer %in with `seq_cst` ordering at
; syncscope("singlethread-one-as"); the loaded value is then written to %out
; with a plain store. Every run expects flat_load_dword, one s_waitcnt, then
; flat_store_dword directly followed by s_endpgm, i.e. no additional
; instruction is expected before the load or between the load and the store
; besides the register moves and the wait. The run with +tgsplit expects
; `s_waitcnt vmcnt(0)` where the other runs expect `s_waitcnt vmcnt(0) lgkmcnt(0)`.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
|
|
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_load:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
|
|
; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %in, i32* %out) {
|
|
entry:
|
|
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4
|
|
store i32 %val, i32* %out
|
|
ret void
|
|
}
|
|
|
|
; Atomic i32 store of %in to the flat pointer %out with `unordered` ordering at
; syncscope("singlethread-one-as"). Every run expects two scalar loads, an
; `s_waitcnt lgkmcnt(0)`, register moves into v[0:1] and v2, and then a single
; flat_store_dword directly followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
|
|
; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_unordered_store:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32 %in, i32* %out) {
|
|
entry:
|
|
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4
|
|
ret void
|
|
}
|
|
|
|
; Atomic i32 store of %in to the flat pointer %out with `monotonic` ordering at
; syncscope("singlethread-one-as"). Every run expects two scalar loads, an
; `s_waitcnt lgkmcnt(0)`, register moves into v[0:1] and v2, and then a single
; flat_store_dword directly followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
|
|
; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_store:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32 %in, i32* %out) {
|
|
entry:
|
|
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4
|
|
ret void
|
|
}
|
|
|
|
; Atomic i32 store of %in to the flat pointer %out with `release` ordering at
; syncscope("singlethread-one-as"). Every run expects two scalar loads, an
; `s_waitcnt lgkmcnt(0)`, register moves into v[0:1] and v2, and then a single
; flat_store_dword directly followed by s_endpgm; no further wait is expected
; between the register moves and the store.
define amdgpu_kernel void @flat_singlethread_one_as_release_store(
|
|
; GFX7-LABEL: flat_singlethread_one_as_release_store:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_store:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32 %in, i32* %out) {
|
|
entry:
|
|
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4
|
|
ret void
|
|
}
|
|
|
|
; Atomic i32 store of %in to the flat pointer %out with `seq_cst` ordering at
; syncscope("singlethread-one-as"). Every run expects two scalar loads, an
; `s_waitcnt lgkmcnt(0)`, register moves into v[0:1] and v2, and then a single
; flat_store_dword directly followed by s_endpgm; no further wait is expected
; between the register moves and the store.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
|
|
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_store:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32 %in, i32* %out) {
|
|
entry:
|
|
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4
|
|
ret void
|
|
}
|
|
|
|
; Volatile `atomicrmw xchg` of %in into the flat pointer %out with `monotonic`
; ordering at syncscope("singlethread-one-as"); the result %val is not used.
; Every run expects two scalar loads, an `s_waitcnt lgkmcnt(0)`, register moves
; into v[0:1] and v2, and then `flat_atomic_swap v[0:1], v2` directly followed
; by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic
|
|
ret void
|
|
}
|
|
|
|
; Volatile `atomicrmw xchg` of %in into the flat pointer %out with `acquire`
; ordering at syncscope("singlethread-one-as"); the result %val is not used.
; Every run expects two scalar loads, an `s_waitcnt lgkmcnt(0)`, register moves
; into v[0:1] and v2, and then `flat_atomic_swap v[0:1], v2` directly followed
; by s_endpgm, i.e. nothing is expected after the atomic instruction.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile `atomicrmw xchg` on a flat i32 pointer with
; syncscope("singlethread-one-as") and release ordering; the old value is unused.
; Every run configuration expects the same shape of code: two s_load
; instructions, a single s_waitcnt lgkmcnt(0), register moves, and a
; non-returning flat_atomic_swap immediately followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile `atomicrmw xchg` on a flat i32 pointer with
; syncscope("singlethread-one-as") and acq_rel ordering; the old value is unused.
; Every run configuration expects two s_load instructions, a single
; s_waitcnt lgkmcnt(0), register moves, and a non-returning flat_atomic_swap
; immediately followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile `atomicrmw xchg` on a flat i32 pointer with
; syncscope("singlethread-one-as") and seq_cst ordering; the old value is unused.
; Every run configuration expects two s_load instructions, a single
; s_waitcnt lgkmcnt(0), register moves, and a non-returning flat_atomic_swap
; immediately followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile `atomicrmw xchg` with syncscope("singlethread-one-as")
; and acquire ordering whose old value IS used: it is stored back through %out.
; The expected code therefore uses the returning form of flat_atomic_swap
; (with glc) and places one s_waitcnt between the atomic and the
; flat_store_dword that consumes its result. The gfx90a tgsplit run expects
; that wait to name only vmcnt(0); every other run expects
; vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
|
|
store i32 %val, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile `atomicrmw xchg` with syncscope("singlethread-one-as")
; and acq_rel ordering whose old value IS used: it is stored back through %out.
; The expected code uses the returning form of flat_atomic_swap (with glc)
; and places one s_waitcnt between the atomic and the flat_store_dword that
; consumes its result. The gfx90a tgsplit run expects that wait to name only
; vmcnt(0); every other run expects vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
|
|
store i32 %val, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile `atomicrmw xchg` with syncscope("singlethread-one-as")
; and seq_cst ordering whose old value IS used: it is stored back through %out.
; The expected code uses the returning form of flat_atomic_swap (with glc)
; and places one s_waitcnt between the atomic and the flat_store_dword that
; consumes its result. The gfx90a tgsplit run expects that wait to name only
; vmcnt(0); every other run expects vmcnt(0) lgkmcnt(0).
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
|
|
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in) {
|
|
entry:
|
|
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
|
|
store i32 %val, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile `cmpxchg` with syncscope("singlethread-one-as"),
; success ordering monotonic and failure ordering monotonic; the result is
; unused. The operation targets %gep, which is %out advanced by 4 i32 elements.
; The gfx7 and gfx10 runs expect that displacement as a scalar add of 16
; (s_add_u32 / s_addc_u32) on the address; the gfx90a runs expect it folded
; into the instruction as offset:16. In every run the non-returning
; flat_atomic_cmpswap is immediately followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile `cmpxchg` with syncscope("singlethread-one-as"),
; success ordering acquire and failure ordering monotonic; the result is
; unused. The operation targets %gep, which is %out advanced by 4 i32 elements.
; The gfx7 and gfx10 runs expect that displacement as a scalar add of 16
; (s_add_u32 / s_addc_u32) on the address; the gfx90a runs expect it folded
; into the instruction as offset:16. In every run the non-returning
; flat_atomic_cmpswap is immediately followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile `cmpxchg` with syncscope("singlethread-one-as"),
; success ordering release and failure ordering monotonic; the result is
; unused. The operation targets %gep, which is %out advanced by 4 i32 elements.
; The gfx7 and gfx10 runs expect that displacement as a scalar add of 16
; (s_add_u32 / s_addc_u32) on the address; the gfx90a runs expect it folded
; into the instruction as offset:16. In every run the non-returning
; flat_atomic_cmpswap is immediately followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile i32 cmpxchg through a flat pointer at
; syncscope("singlethread-one-as") with success ordering acq_rel and failure
; ordering monotonic. For every run line the autogenerated checks below expect
; only the kernel-argument loads, one s_waitcnt for those loads, the
; address/operand setup and a single flat_atomic_cmpswap that is immediately
; followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Element index 4 of an i32 pointer is byte offset 16; the expectations above
; show it as an s_add_u32/s_addc_u32 of 16 or as an offset of 16 on the atomic.
%gep = getelementptr i32, i32* %out, i32 4
|
|
; %val is never read in this function; only the emitted instructions are checked.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile i32 cmpxchg through a flat pointer at
; syncscope("singlethread-one-as") with success ordering seq_cst and failure
; ordering monotonic. For every run line the autogenerated checks below expect
; only the kernel-argument loads, one s_waitcnt for those loads, the
; address/operand setup and a single flat_atomic_cmpswap that is immediately
; followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Element index 4 of an i32 pointer is byte offset 16; the expectations above
; show it as an s_add_u32/s_addc_u32 of 16 or as an offset of 16 on the atomic.
%gep = getelementptr i32, i32* %out, i32 4
|
|
; %val is never read in this function; only the emitted instructions are checked.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile i32 cmpxchg through a flat pointer at
; syncscope("singlethread-one-as") with success ordering monotonic and failure
; ordering acquire. For every run line the autogenerated checks below expect
; only the kernel-argument loads, one s_waitcnt for those loads, the
; address/operand setup and a single flat_atomic_cmpswap that is immediately
; followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Element index 4 of an i32 pointer is byte offset 16; the expectations above
; show it as an s_add_u32/s_addc_u32 of 16 or as an offset of 16 on the atomic.
%gep = getelementptr i32, i32* %out, i32 4
|
|
; %val is never read in this function; only the emitted instructions are checked.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile i32 cmpxchg through a flat pointer at
; syncscope("singlethread-one-as") with success ordering acquire and failure
; ordering acquire. For every run line the autogenerated checks below expect
; only the kernel-argument loads, one s_waitcnt for those loads, the
; address/operand setup and a single flat_atomic_cmpswap that is immediately
; followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Element index 4 of an i32 pointer is byte offset 16; the expectations above
; show it as an s_add_u32/s_addc_u32 of 16 or as an offset of 16 on the atomic.
%gep = getelementptr i32, i32* %out, i32 4
|
|
; %val is never read in this function; only the emitted instructions are checked.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile i32 cmpxchg through a flat pointer at
; syncscope("singlethread-one-as") with success ordering release and failure
; ordering acquire. For every run line the autogenerated checks below expect
; only the kernel-argument loads, one s_waitcnt for those loads, the
; address/operand setup and a single flat_atomic_cmpswap that is immediately
; followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Element index 4 of an i32 pointer is byte offset 16; the expectations above
; show it as an s_add_u32/s_addc_u32 of 16 or as an offset of 16 on the atomic.
%gep = getelementptr i32, i32* %out, i32 4
|
|
; %val is never read in this function; only the emitted instructions are checked.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile i32 cmpxchg through a flat pointer at
; syncscope("singlethread-one-as") with success ordering acq_rel and failure
; ordering acquire. For every run line the autogenerated checks below expect
; only the kernel-argument loads, one s_waitcnt for those loads, the
; address/operand setup and a single flat_atomic_cmpswap that is immediately
; followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Element index 4 of an i32 pointer is byte offset 16; the expectations above
; show it as an s_add_u32/s_addc_u32 of 16 or as an offset of 16 on the atomic.
%gep = getelementptr i32, i32* %out, i32 4
|
|
; %val is never read in this function; only the emitted instructions are checked.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile i32 cmpxchg through a flat pointer at
; syncscope("singlethread-one-as") with success ordering seq_cst and failure
; ordering acquire. For every run line the autogenerated checks below expect
; only the kernel-argument loads, one s_waitcnt for those loads, the
; address/operand setup and a single flat_atomic_cmpswap that is immediately
; followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Element index 4 of an i32 pointer is byte offset 16; the expectations above
; show it as an s_add_u32/s_addc_u32 of 16 or as an offset of 16 on the atomic.
%gep = getelementptr i32, i32* %out, i32 4
|
|
; %val is never read in this function; only the emitted instructions are checked.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
|
|
ret void
|
|
}
|
|
|
|
; Test case: volatile i32 cmpxchg through a flat pointer at
; syncscope("singlethread-one-as") with success ordering monotonic and failure
; ordering seq_cst. For every run line the autogenerated checks below expect
; only the kernel-argument loads, one s_waitcnt for those loads, the
; address/operand setup and a single flat_atomic_cmpswap that is immediately
; followed by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Element index 4 of an i32 pointer is byte offset 16; the expectations above
; show it as an s_add_u32/s_addc_u32 of 16 or as an offset of 16 on the atomic.
%gep = getelementptr i32, i32* %out, i32 4
|
|
; %val is never read in this function; only the emitted instructions are checked.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Non-returning flat cmpxchg on element 4 of %out (the +16 / offset:16 in the
; checks) at syncscope("singlethread-one-as") with acquire success and seq_cst
; failure ordering. The result %val is unused. For every check prefix the
; expected flat_atomic_cmpswap is followed directly by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Non-returning flat cmpxchg on element 4 of %out (the +16 / offset:16 in the
; checks) at syncscope("singlethread-one-as") with release success and seq_cst
; failure ordering. The result %val is unused. For every check prefix the
; expected flat_atomic_cmpswap is followed directly by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Non-returning flat cmpxchg on element 4 of %out (the +16 / offset:16 in the
; checks) at syncscope("singlethread-one-as") with acq_rel success and seq_cst
; failure ordering. The result %val is unused. For every check prefix the
; expected flat_atomic_cmpswap is followed directly by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Non-returning flat cmpxchg on element 4 of %out (the +16 / offset:16 in the
; checks) at syncscope("singlethread-one-as") with seq_cst success and seq_cst
; failure ordering. The result %val is unused. For every check prefix the
; expected flat_atomic_cmpswap is followed directly by s_endpgm.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s0, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Returning flat cmpxchg on element 4 of %out (the +16 / offset:16 in the
; checks) at syncscope("singlethread-one-as") with monotonic success and
; monotonic failure ordering. Field 0 of the result is stored back to %out.
; The checks expect the glc form of flat_atomic_cmpswap and an s_waitcnt
; before the flat_store_dword: vmcnt(0) lgkmcnt(0) for every prefix except
; GFX90A-TGSPLIT, which expects vmcnt(0) only.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Returning flat cmpxchg on element 4 of %out (the +16 / offset:16 in the
; checks) at syncscope("singlethread-one-as") with acquire success and
; monotonic failure ordering. Field 0 of the result is stored back to %out.
; The checks expect the glc form of flat_atomic_cmpswap and an s_waitcnt
; before the flat_store_dword: vmcnt(0) lgkmcnt(0) for every prefix except
; GFX90A-TGSPLIT, which expects vmcnt(0) only.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Returning flat cmpxchg on element 4 of %out (the +16 / offset:16 in the
; checks) at syncscope("singlethread-one-as") with release success and
; monotonic failure ordering. Field 0 of the result is stored back to %out.
; The checks expect the glc form of flat_atomic_cmpswap and an s_waitcnt
; before the flat_store_dword: vmcnt(0) lgkmcnt(0) for every prefix except
; GFX90A-TGSPLIT, which expects vmcnt(0) only.
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic
|
|
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: volatile cmpxchg on the i32 four elements (16 bytes) past %out,
; in syncscope "singlethread-one-as", with success ordering acq_rel and
; failure ordering monotonic. %old is the compare operand and %in is the
; replacement operand; the value read back by the cmpxchg is stored to %out.
; The check lines sitting between the opening parenthesis and the parameter
; list are tool-generated expected llc output, one group per run
; configuration; regenerate them with the update script named in the file
; header rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Address of the i32 four elements past %out (the 16-byte offset seen in the checks).
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic
|
|
; Field 0 of the cmpxchg result is the value read from memory; the i1 success flag is unused.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: volatile cmpxchg on the i32 four elements (16 bytes) past %out,
; in syncscope "singlethread-one-as", with success ordering seq_cst and
; failure ordering monotonic. %old is the compare operand and %in is the
; replacement operand; the value read back by the cmpxchg is stored to %out.
; The check lines sitting between the opening parenthesis and the parameter
; list are tool-generated expected llc output, one group per run
; configuration; regenerate them with the update script named in the file
; header rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Address of the i32 four elements past %out (the 16-byte offset seen in the checks).
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic
|
|
; Field 0 of the cmpxchg result is the value read from memory; the i1 success flag is unused.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: volatile cmpxchg on the i32 four elements (16 bytes) past %out,
; in syncscope "singlethread-one-as", with success ordering monotonic and
; failure ordering acquire. %old is the compare operand and %in is the
; replacement operand; the value read back by the cmpxchg is stored to %out.
; The check lines sitting between the opening parenthesis and the parameter
; list are tool-generated expected llc output, one group per run
; configuration; regenerate them with the update script named in the file
; header rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Address of the i32 four elements past %out (the 16-byte offset seen in the checks).
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire
|
|
; Field 0 of the cmpxchg result is the value read from memory; the i1 success flag is unused.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: volatile cmpxchg on the i32 four elements (16 bytes) past %out,
; in syncscope "singlethread-one-as", with success ordering acquire and
; failure ordering acquire. %old is the compare operand and %in is the
; replacement operand; the value read back by the cmpxchg is stored to %out.
; The check lines sitting between the opening parenthesis and the parameter
; list are tool-generated expected llc output, one group per run
; configuration; regenerate them with the update script named in the file
; header rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Address of the i32 four elements past %out (the 16-byte offset seen in the checks).
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire
|
|
; Field 0 of the cmpxchg result is the value read from memory; the i1 success flag is unused.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: volatile cmpxchg on the i32 four elements (16 bytes) past %out,
; in syncscope "singlethread-one-as", with success ordering release and
; failure ordering acquire. %old is the compare operand and %in is the
; replacement operand; the value read back by the cmpxchg is stored to %out.
; The check lines sitting between the opening parenthesis and the parameter
; list are tool-generated expected llc output, one group per run
; configuration; regenerate them with the update script named in the file
; header rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Address of the i32 four elements past %out (the 16-byte offset seen in the checks).
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire
|
|
; Field 0 of the cmpxchg result is the value read from memory; the i1 success flag is unused.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: volatile cmpxchg on the i32 four elements (16 bytes) past %out,
; in syncscope "singlethread-one-as", with success ordering acq_rel and
; failure ordering acquire. %old is the compare operand and %in is the
; replacement operand; the value read back by the cmpxchg is stored to %out.
; The check lines sitting between the opening parenthesis and the parameter
; list are tool-generated expected llc output, one group per run
; configuration; regenerate them with the update script named in the file
; header rather than editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Address of the i32 four elements past %out (the 16-byte offset seen in the checks).
%gep = getelementptr i32, i32* %out, i32 4
|
|
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire
|
|
; Field 0 of the cmpxchg result is the value read from memory; the i1 success flag is unused.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Value-returning cmpxchg on an i32 pointer in the default address space with
; syncscope("singlethread-one-as"): success ordering seq_cst, failure ordering
; acquire. The kernel compare-and-swaps the i32 at %out + 16 bytes and stores
; the value it read back to %out. The per-configuration check lines below are
; autogenerated; regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Target address: element 4 of the i32 array at %out (byte offset 16).
%gep = getelementptr i32, i32* %out, i32 4
|
|
; Compare against %old and swap in %in; success ordering seq_cst, failure ordering acquire.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire
|
|
; Field 0 of the { i32, i1 } result is the value read from memory; it is stored to %out below.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Value-returning cmpxchg on an i32 pointer in the default address space with
; syncscope("singlethread-one-as"): success ordering monotonic, failure
; ordering seq_cst. The kernel compare-and-swaps the i32 at %out + 16 bytes and
; stores the value it read back to %out. The per-configuration check lines
; below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Target address: element 4 of the i32 array at %out (byte offset 16).
%gep = getelementptr i32, i32* %out, i32 4
|
|
; Compare against %old and swap in %in; success ordering monotonic, failure ordering seq_cst.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst
|
|
; Field 0 of the { i32, i1 } result is the value read from memory; it is stored to %out below.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Value-returning cmpxchg on an i32 pointer in the default address space with
; syncscope("singlethread-one-as"): success ordering acquire, failure ordering
; seq_cst. The kernel compare-and-swaps the i32 at %out + 16 bytes and stores
; the value it read back to %out. The per-configuration check lines below are
; autogenerated; regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Target address: element 4 of the i32 array at %out (byte offset 16).
%gep = getelementptr i32, i32* %out, i32 4
|
|
; Compare against %old and swap in %in; success ordering acquire, failure ordering seq_cst.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst
|
|
; Field 0 of the { i32, i1 } result is the value read from memory; it is stored to %out below.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Value-returning cmpxchg on an i32 pointer in the default address space with
; syncscope("singlethread-one-as"): success ordering release, failure ordering
; seq_cst. The kernel compare-and-swaps the i32 at %out + 16 bytes and stores
; the value it read back to %out. The per-configuration check lines below are
; autogenerated; regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Target address: element 4 of the i32 array at %out (byte offset 16).
%gep = getelementptr i32, i32* %out, i32 4
|
|
; Compare against %old and swap in %in; success ordering release, failure ordering seq_cst.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst
|
|
; Field 0 of the { i32, i1 } result is the value read from memory; it is stored to %out below.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Value-returning cmpxchg on an i32 pointer in the default address space with
; syncscope("singlethread-one-as"): success ordering acq_rel, failure ordering
; seq_cst. The kernel compare-and-swaps the i32 at %out + 16 bytes and stores
; the value it read back to %out. The per-configuration check lines below are
; autogenerated; regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Target address: element 4 of the i32 array at %out (byte offset 16).
%gep = getelementptr i32, i32* %out, i32 4
|
|
; Compare against %old and swap in %in; success ordering acq_rel, failure ordering seq_cst.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst
|
|
; Field 0 of the { i32, i1 } result is the value read from memory; it is stored to %out below.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|
|
; Value-returning cmpxchg on an i32 pointer in the default address space with
; syncscope("singlethread-one-as"): seq_cst for both the success and the
; failure ordering. The kernel compare-and-swaps the i32 at %out + 16 bytes and
; stores the value it read back to %out. The per-configuration check lines
; below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
|
|
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX7-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
|
|
; GFX10-WGP: ; %bb.0: ; %entry
|
|
; GFX10-WGP-NEXT: s_clause 0x1
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-WGP-NEXT: s_endpgm
|
|
;
|
|
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
|
|
; GFX10-CU: ; %bb.0: ; %entry
|
|
; GFX10-CU-NEXT: s_clause 0x1
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16
|
|
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX10-CU-NEXT: s_endpgm
|
|
;
|
|
; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
|
|
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
|
|
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16
|
|
; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
|
|
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
|
|
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
|
|
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
|
|
; SKIP-CACHE-INV-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
|
|
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
|
|
;
|
|
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
|
|
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
|
|
; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
|
|
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
|
|
; GFX90A-TGSPLIT-NEXT: s_endpgm
|
|
i32* %out, i32 %in, i32 %old) {
|
|
entry:
|
|
; Target address: element 4 of the i32 array at %out (byte offset 16).
%gep = getelementptr i32, i32* %out, i32 4
|
|
; Compare against %old and swap in %in; seq_cst is used for both success and failure ordering.
%val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst
|
|
; Field 0 of the { i32, i1 } result is the value read from memory; it is stored to %out below.
%val0 = extractvalue { i32, i1 } %val, 0
|
|
store i32 %val0, i32* %out, align 4
|
|
ret void
|
|
}
|
|
|