Allocating wwm-registers and per-thread VGPR operands together poses many challenges in how registers are reused during allocation. At times regalloc reuses the registers of regular VGPR operations for wwm-operations within a small range, unintentionally clobbering their inactive lanes and causing correctness issues that are hard to trace. This patch splits the VGPR allocation pipeline further so that wwm-registers are allocated first and the regular VGPR operands are allocated in a separate pipeline. The split ensures that the physical registers used for wwm allocations do not take part in the next allocation pipeline, which avoids any such clobbering.
246 lines
12 KiB
YAML
246 lines
12 KiB
YAML
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
|
|
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog -o - %s | FileCheck -check-prefix=GCN %s
|
|
|
|
# We're keeping the IR around for the callees and the calling conventions (CCs).
|
|
|
|
--- |
; Stub LLVM IR module backing the MIR functions in this file.
; It declares two external callees (@callee as amdgpu_cs_chain, @gfx_callee as
; amdgpu_gfx) and defines one amdgpu_cs_chain function with a bare `ret void`
; body for each MIR function name used in this file.
|
|
declare amdgpu_cs_chain void @callee()
|
|
declare amdgpu_gfx void @gfx_callee()
|
|
|
|
define amdgpu_cs_chain void @preserve_inactive_wwm() {ret void}
|
|
define amdgpu_cs_chain void @preserve_inactive_detected_wwm() {ret void}
|
|
define amdgpu_cs_chain void @dont_preserve_wwm_if_no_chain_calls() {ret void}
|
|
define amdgpu_cs_chain void @dont_preserve_wwm_if_init_whole_wave() {ret void}
|
|
define amdgpu_cs_chain void @dont_preserve_non_wwm() {ret void}
|
|
define amdgpu_cs_chain void @dont_preserve_v0_v7() {ret void}
|
|
define amdgpu_cs_chain void @dont_preserve_sgpr() {ret void}
|
|
...
|
|
---
|
|
|
|
# Check that we preserve the inactive lanes of registers v8+ received in the
|
|
# MachineFunctionInfo as wwmReservedRegs.
|
|
|
|
---
# preserve_inactive_wwm: tail-calling function (hasTailCall: true) that lists
# $vgpr8 and $vgpr9 in wwmReservedRegs. The body only materialises the address
# of @callee and issues SI_CS_CHAIN_TC_W32 with $vgpr8 as an implicit operand.
# The expected output lists only $sgpr0 and $sgpr35 as live-ins and otherwise
# repeats the input instructions, including the implicit $vgpr8 on the call.
|
|
name: preserve_inactive_wwm
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: true
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
returnsVoid: true
|
|
wwmReservedRegs:
|
|
- '$vgpr8'
|
|
- '$vgpr9'
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
|
|
|
|
; GCN-LABEL: name: preserve_inactive_wwm
|
|
; GCN: liveins: $sgpr0, $sgpr35
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
|
|
...
|
|
|
|
# Check that it also works for SGPR to VGPR spills.
|
|
|
|
---
# preserve_inactive_detected_wwm: tail-calling function with no wwmReservedRegs
# entry. The body uses $vgpr8 and $vgpr9 as the VGPR side of
# SI_SPILL_S32_TO_VGPR / SI_RESTORE_S32_FROM_VGPR pairs for $sgpr35, then
# overwrites each with V_MOV_B32 before the chain call to @callee.
# The GCN lines list the same instruction sequence as the input (they omit the
# `renamable` flag on the two spill destinations).
|
|
name: preserve_inactive_detected_wwm
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: true
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
returnsVoid: true
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
|
|
|
|
; GCN-LABEL: name: preserve_inactive_detected_wwm
|
|
; GCN: liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
|
|
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
|
|
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
|
|
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
|
|
; GCN-NEXT: $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9
|
|
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
|
|
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr9, 0
|
|
; GCN-NEXT: renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
|
|
$sgpr35 = S_MOV_B32 5
|
|
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
|
|
renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
|
|
renamable $vgpr9 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr9
|
|
$sgpr35 = S_MOV_B32 5
|
|
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr9, 0
|
|
renamable $vgpr9 = V_MOV_B32_e32 10, implicit $exec
|
|
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
|
|
...
|
|
|
|
---
# dont_preserve_wwm_if_no_chain_calls: hasTailCall is false and the body ends
# in S_ENDPGM 0 instead of a chain call. $vgpr9 is listed in wwmReservedRegs
# and $vgpr8 is used as an SGPR-to-VGPR spill register in the body.
# The GCN lines list the same instruction sequence as the input.
|
|
name: dont_preserve_wwm_if_no_chain_calls
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: false
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
returnsVoid: true
|
|
wwmReservedRegs:
|
|
- '$vgpr9'
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr35, $vgpr8
|
|
|
|
; GCN-LABEL: name: dont_preserve_wwm_if_no_chain_calls
|
|
; GCN: liveins: $sgpr35, $vgpr8
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
|
|
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
|
|
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
|
|
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
|
|
; GCN-NEXT: S_ENDPGM 0
|
|
renamable $vgpr8 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr8
|
|
$sgpr35 = S_MOV_B32 5
|
|
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr8, 0
|
|
renamable $vgpr8 = V_MOV_B32_e32 10, implicit $exec
|
|
S_ENDPGM 0
|
|
...
|
|
|
|
---
# dont_preserve_wwm_if_init_whole_wave: same input as preserve_inactive_wwm
# ($vgpr8/$vgpr9 in wwmReservedRegs, tail call to @callee) but with
# hasInitWholeWave: true set in machineFunctionInfo.
# The expected output lists only $sgpr0 and $sgpr35 as live-ins and otherwise
# repeats the input instructions, including the implicit $vgpr8 on the call.
|
|
name: dont_preserve_wwm_if_init_whole_wave
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: true
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
returnsVoid: true
|
|
wwmReservedRegs:
|
|
- '$vgpr8'
|
|
- '$vgpr9'
|
|
hasInitWholeWave: true
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr0, $sgpr35, $vgpr8, $vgpr9
|
|
|
|
; GCN-LABEL: name: dont_preserve_wwm_if_init_whole_wave
|
|
; GCN: liveins: $sgpr0, $sgpr35
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
|
|
...
|
|
|
|
---
# dont_preserve_non_wwm: chain function (isChainFunction: true) with no
# wwmReservedRegs. The body writes $vgpr16 and $vgpr8 with plain V_MOV_B32
# instructions before the chain call to @callee.
# The GCN lines list the same live-ins and instruction sequence as the input.
|
|
name: dont_preserve_non_wwm
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: true
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
isChainFunction: true
|
|
returnsVoid: true
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr16
|
|
|
|
; GCN-LABEL: name: dont_preserve_non_wwm
|
|
; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr8, $vgpr16
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: renamable $vgpr16 = V_MOV_B32_e32 16, implicit $exec
|
|
; GCN-NEXT: renamable $vgpr8 = V_MOV_B32_e32 8, implicit $exec
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
renamable $vgpr16 = V_MOV_B32_e32 16, implicit $exec
|
|
renamable $vgpr8 = V_MOV_B32_e32 8, implicit $exec
|
|
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8
|
|
|
|
...
|
|
|
|
---
# dont_preserve_v0_v7: chain function (isChainFunction: true) with $vgpr1 in
# wwmReservedRegs. The body uses $vgpr0 as an SGPR-to-VGPR spill register,
# writes $vgpr0 and $vgpr7 with V_MOV_B32, and copies them into $vgpr8/$vgpr9
# before the chain call to @callee.
# The GCN lines list the same live-ins and instruction sequence as the input.
# NOTE(review): $vgpr1 is reserved but never appears in liveins or the body;
# confirm that is intentional.
|
|
name: dont_preserve_v0_v7
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: true
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
isChainFunction: true
|
|
returnsVoid: true
|
|
wwmReservedRegs:
|
|
- '$vgpr1'
|
|
body: |
|
|
bb.0:
|
|
liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9
|
|
|
|
; GCN-LABEL: name: dont_preserve_v0_v7
|
|
; GCN: liveins: $sgpr0, $sgpr35, $vgpr0, $vgpr7, $vgpr8, $vgpr9
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
|
|
; GCN-NEXT: $sgpr35 = S_MOV_B32 5
|
|
; GCN-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
|
|
; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
|
|
; GCN-NEXT: renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec
|
|
; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr0
|
|
; GCN-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr7
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
|
|
renamable $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr35, 0, killed $vgpr0
|
|
$sgpr35 = S_MOV_B32 5
|
|
$sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0
|
|
renamable $vgpr0 = V_MOV_B32_e32 10, implicit $exec
|
|
renamable $vgpr7 = V_MOV_B32_e32 16, implicit $exec
|
|
renamable $vgpr8 = COPY killed renamable $vgpr0
|
|
renamable $vgpr9 = COPY killed renamable $vgpr7
|
|
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0, implicit $vgpr8, implicit $vgpr9
|
|
|
|
...
|
|
|
|
---
# dont_preserve_sgpr: tail-calling function whose body touches only SGPRs
# (S_ADD_I32 into $sgpr1, COPY back into $sgpr0) before the chain call to
# @callee; no VGPR appears in liveins, the body, or wwmReservedRegs.
# The GCN lines list the same live-ins and instruction sequence as the input.
|
|
name: dont_preserve_sgpr
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
hasTailCall: true
|
|
machineFunctionInfo:
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
returnsVoid: true
|
|
body: |
|
|
bb.0 (%ir-block.0):
|
|
liveins: $sgpr0
|
|
|
|
; GCN-LABEL: name: dont_preserve_sgpr
|
|
; GCN: liveins: $sgpr0
|
|
; GCN-NEXT: {{ $}}
|
|
; GCN-NEXT: renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc
|
|
; GCN-NEXT: $sgpr0 = COPY killed renamable $sgpr1
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
; GCN-NEXT: renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
; GCN-NEXT: SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0
|
|
renamable $sgpr1 = S_ADD_I32 killed renamable $sgpr0, renamable $sgpr0, implicit-def dead $scc
|
|
$sgpr0 = COPY killed renamable $sgpr1
|
|
renamable $sgpr4_sgpr5 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @callee + 4, target-flags(amdgpu-gotprel32-hi) @callee + 12, implicit-def dead $scc
|
|
renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4)
|
|
SI_CS_CHAIN_TC_W32 killed renamable $sgpr4_sgpr5, @callee, 0, -1, amdgpu_allvgprs, implicit $sgpr0
|
|
|
|
...
|