Files
clang-p2996/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll
Carl Ritson 86627149f6 [AMDGPU] Mitigate GFX12 VALU read SGPR hazard (#100067)
Any SGPR read by a VALU can potentially obscure SALU writes to the same
register.
Insert s_wait_alu instructions to mitigate the hazard on affected paths.

Compute a global cache of SGPRs with any VALU reads and use this to
avoid inserting mitigation for SGPRs never accessed by VALUs.

To avoid excessive search when compile time is priority implement
secondary mode where all SALU writes are mitigated.

Co-authored-by: Shilei Tian <shilei.tian@amd.com>
2024-09-04 12:15:20 +09:00

125 lines
6.5 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; FIXME: Check gfx90a, 940. 908 should fail to select.
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s
define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX1200: ; %bb.0:
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT: s_wait_expcnt 0x0
; GFX1200-NEXT: s_wait_samplecnt 0x0
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x bfloat> %ret
}
define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX1200: ; %bb.0:
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT: s_wait_expcnt 0x0
; GFX1200-NEXT: s_wait_samplecnt 0x0
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s6 idxen offen
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%unused = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
; Test waterfall loop
define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) #0 {
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX1200: ; %bb.0:
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT: s_wait_expcnt 0x0
; GFX1200-NEXT: s_wait_samplecnt 0x0
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: s_mov_b32 s2, exec_lo
; GFX1200-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB2_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_loadcnt 0x0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret <2 x bfloat> %ret
}
define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) #0 {
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX1200: ; %bb.0:
; GFX1200-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT: s_wait_expcnt 0x0
; GFX1200-NEXT: s_wait_samplecnt 0x0
; GFX1200-NEXT: s_wait_bvhcnt 0x0
; GFX1200-NEXT: s_wait_kmcnt 0x0
; GFX1200-NEXT: s_mov_b32 s2, exec_lo
; GFX1200-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
; GFX1200-NEXT: v_readfirstlane_b32 s5, v2
; GFX1200-NEXT: v_readfirstlane_b32 s6, v3
; GFX1200-NEXT: v_readfirstlane_b32 s7, v4
; GFX1200-NEXT: v_readfirstlane_b32 s3, v7
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: s_and_b32 s0, s0, s1
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_and_saveexec_b32 s0, s0
; GFX1200-NEXT: buffer_atomic_pk_add_bf16 v0, v[5:6], s[4:7], s3 idxen offen
; GFX1200-NEXT: ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT: ; implicit-def: $vgpr7
; GFX1200-NEXT: ; implicit-def: $vgpr0
; GFX1200-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT: s_cbranch_execnz .LBB3_1
; GFX1200-NEXT: ; %bb.2:
; GFX1200-NEXT: s_mov_b32 exec_lo, s2
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
ret void
}
declare <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat>, ptr addrspace(8), i32, i32, i32, i32 immarg)
attributes #0 = { nounwind }