[AMDGPU][True16][CodeGen] atomic load/store i8 in true16 mode (#143044)

Follow-up patch to https://github.com/llvm/llvm-project/pull/142822.

Update the remaining 16-bit atomic load/store patterns in t16 mode.
Brox Chen
2025-06-09 15:05:06 -04:00
committed by GitHub
parent 78af498035
commit 8380a5500b
3 changed files with 2896 additions and 88 deletions
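
For context, a minimal LLVM IR sketch of the kind of code covered by the updated patterns; it mirrors the atomic_store_i8 test further below, and the kernel name and build flags are illustrative assumptions rather than part of this commit.

; Sketch: assumes a GFX11 target compiled with -mattr=+real-true16.
; In true16 mode the i8 atomic store is matched by the new t16 store
; patterns, so the value lives in a 16-bit register half (v0.l) and is
; written with global_store_b8, as the GFX11-TRUE16 checks below show.
define amdgpu_kernel void @store_atomic_i8_sketch(i8 %in, ptr addrspace(1) %out) {
entry:
  store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1
  ret void
}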


@@ -1537,17 +1537,13 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
let OtherPredicates = [HasFlatAddressSpace] in {
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
@@ -1560,8 +1556,14 @@ let True16Predicate = p in {
def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
}
let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in {
@@ -1569,8 +1571,14 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>;
def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>;
def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>;
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>;
def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>;
def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>;
def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE_t16, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT_t16, store_flat, i16>;
def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>;
} // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts
def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>;
@@ -1599,9 +1607,7 @@ def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>;
def : FlatStorePat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
def : FlatStorePat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
foreach as = [ "flat", "global" ] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
@@ -1680,9 +1686,7 @@ let OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_aext_16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i32>;
@@ -1702,6 +1706,8 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, load_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, atomic_load_sext_8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
}
let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
@@ -1712,8 +1718,12 @@ defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", load_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_aext_8_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", atomic_load_zext_8_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", atomic_load_sext_8_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_nonext_16_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SHORT_D16", atomic_load_zext_16_global, i16>;
defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", truncstorei8_global, i16>;
defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", store_global, i16>;
defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_BYTE", atomic_store_8_global, i16>;
defm : GlobalFLATStorePats_D16_t16<"GLOBAL_STORE_SHORT", atomic_store_16_global, i16>;
} // end OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts
foreach vt = Reg32Types.types in {
@@ -1747,6 +1757,8 @@ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in {
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, store_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;
}
let OtherPredicates = [HasD16LoadStore] in {
@@ -1772,9 +1784,7 @@ defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>
}
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;

File diff suppressed because it is too large.


@@ -7589,15 +7589,26 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out)
; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:16
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_store_i8_offset:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] offset:16
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: atomic_store_i8_offset:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b8 v1, v0, s[0:1] offset:16
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: atomic_store_i8_offset:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b8 v0, v1, s[0:1] offset:16
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(1) %out, i64 16
store atomic i8 %in, ptr addrspace(1) %gep seq_cst, align 1
@@ -7637,15 +7648,26 @@ define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) {
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_store_i8:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: atomic_store_i8:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b8 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: atomic_store_i8:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1
ret void
@@ -7700,7 +7722,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -7778,7 +7800,7 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -7838,15 +7860,26 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %ou
; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_store_i16_offset:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: atomic_store_i16_offset:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:16
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: atomic_store_i16_offset:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr addrspace(1) %out, i64 8
store atomic i16 %in, ptr addrspace(1) %gep seq_cst, align 2
@@ -7886,15 +7919,26 @@ define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_store_i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: atomic_store_i16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: atomic_store_i16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
store atomic i16 %in, ptr addrspace(1) %out seq_cst, align 2
ret void
@@ -7935,15 +7979,26 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %o
; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_store_f16_offset:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: atomic_store_f16_offset:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:16
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: atomic_store_f16_offset:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%gep = getelementptr half, ptr addrspace(1) %out, i64 8
store atomic half %in, ptr addrspace(1) %gep seq_cst, align 2
@@ -7983,15 +8038,26 @@ define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_store_f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: atomic_store_f16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: atomic_store_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
store atomic half %in, ptr addrspace(1) %out seq_cst, align 2
ret void
@@ -8032,15 +8098,26 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr addrspace(1)
; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:16
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_store_bf16_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: atomic_store_bf16_offset:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] offset:16
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: atomic_store_bf16_offset:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] offset:16
; GFX11-FAKE16-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr addrspace(1) %out, i64 8
store atomic bfloat %in, ptr addrspace(1) %gep seq_cst, align 2
ret void
@@ -8079,15 +8156,26 @@ define amdgpu_kernel void @atomic_store_bf16(bfloat %in, ptr addrspace(1) %out)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: atomic_store_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
; GFX11-TRUE16-LABEL: atomic_store_bf16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: atomic_store_bf16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_clause 0x1
; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x24
; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x2c
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
store atomic bfloat %in, ptr addrspace(1) %out seq_cst, align 2
ret void
}
@@ -9099,7 +9187,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9176,7 +9264,7 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9249,7 +9337,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:16 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -9326,7 +9414,7 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] offset:-512 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv