Previously this was only attempted in order to relax register class constraints, but it can also help if there are subranges involved. This solves a compilation failure for AMDGPU when there is high pressure created by large register tuples. If one virtual register is using most of the available budget, we need to be able to evict subranges. This solves the immediate failure, but the solution leaves a lot to be desired. In the relevant testcases, we have 32-element tuples, but most of the uses are operations on 1-element subranges of them. What we're now getting is a spill and restore of the full 1024 bits and an extract of the used 32 bits. It would be far better if we introduced a copy to a new virtual register with a smaller register class and used narrower spills. Furthermore, we could probably do a better job if the allocator were to introduce new subranges where none previously existed in the highest pressure scenarios. The block and region splits should also try to split specific subranges out. The mve-vst3.ll test changes look like noise to me, but the instruction count increased by one. mve-vst4.ll looks like a solid improvement, with several 16-byte spills eliminated. splitkit-copy-live-lanes.mir also shows a solid reduction in total spill count. This could use more tests, but it's pretty tiring to come up with cases that fail on this.
95 lines
5.5 KiB
YAML
95 lines
5.5 KiB
YAML
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
|
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-regalloc -stress-regalloc=3 -start-before=greedy,1 -stop-before=virtregrewriter,1 -o - %s | FileCheck %s
|
|
# Three vreg_64 loads (%1, %2, %3) are defined back to back, and each is later
# read through only one 32-bit subregister (%1.sub1, %2.sub0, %3.sub1).
# The CHECK lines expect the second load to be spilled and restored as a full
# 64-bit value (%stack.0), and every S_NOP use to be preceded by a COPY of just
# the used subregister into a fresh virtual register.
---
|
|
name: split_instruction_subranges
|
|
alignment: 1
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
maxAlignment: 1
|
|
hasCalls: true
|
|
machineFunctionInfo:
|
|
maxKernArgAlign: 1
|
|
isEntryFunction: true
|
|
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
argumentInfo:
|
|
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
|
privateSegmentWaveByteOffset: { reg: '$sgpr17' }
|
|
occupancy: 8
|
|
body: |
|
|
bb.0:
|
|
; CHECK-LABEL: name: split_instruction_subranges
|
|
; CHECK: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %1:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %3:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
|
|
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
; CHECK-NEXT: undef %9.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
|
|
; CHECK-NEXT: S_NOP 0, implicit %9.sub1
|
|
; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
|
|
; CHECK-NEXT: undef %11.sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0
|
|
; CHECK-NEXT: S_NOP 0, implicit %11.sub0
|
|
; CHECK-NEXT: undef %7.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1
|
|
; CHECK-NEXT: S_NOP 0, implicit %7.sub1
|
|
; CHECK-NEXT: S_ENDPGM 0
|
|
%1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
%2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
%3:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %6:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
S_NOP 0, implicit %1.sub1
|
|
S_NOP 0, implicit %2.sub0
|
|
S_NOP 0, implicit %3.sub1
|
|
S_ENDPGM 0
|
|
|
|
...
|
|
|
|
# Three vreg_64 loads (%1, %2, %3) are defined back to back. Each register then
# has one subregister written by an implicit-def on an S_NOP, and afterwards the
# other subregister is read (%1.sub1, %2.sub0, %3.sub1).
# The CHECK lines expect two full 64-bit spill/restore pairs (%stack.1 and
# %stack.0) plus subregister COPYs into fresh virtual registers around the
# S_NOP defs and uses.
---
|
|
name: split_instruction_subranges_use_is_subreg_def
|
|
alignment: 1
|
|
tracksRegLiveness: true
|
|
frameInfo:
|
|
maxAlignment: 1
|
|
hasCalls: true
|
|
machineFunctionInfo:
|
|
maxKernArgAlign: 1
|
|
isEntryFunction: true
|
|
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
|
|
stackPtrOffsetReg: '$sgpr32'
|
|
argumentInfo:
|
|
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
|
privateSegmentWaveByteOffset: { reg: '$sgpr17' }
|
|
occupancy: 8
|
|
body: |
|
|
bb.0:
|
|
; CHECK-LABEL: name: split_instruction_subranges_use_is_subreg_def
|
|
; CHECK: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %1:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %3:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR1]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5)
|
|
; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR2]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
|
|
; CHECK-NEXT: S_NOP 0, implicit-def [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0
|
|
; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5)
|
|
; CHECK-NEXT: undef %13.sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0
|
|
; CHECK-NEXT: S_NOP 0, implicit-def %13.sub1
|
|
; CHECK-NEXT: undef %15.sub0:vreg_64 = COPY %13.sub0
|
|
; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
|
|
; CHECK-NEXT: undef %7.sub1:vreg_64 = COPY [[SI_SPILL_V64_RESTORE1]].sub1
|
|
; CHECK-NEXT: S_NOP 0, implicit-def %7.sub0
|
|
; CHECK-NEXT: undef %9.sub1:vreg_64 = COPY %7.sub1
|
|
; CHECK-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
|
|
; CHECK-NEXT: undef %14.sub0:vreg_64 = COPY %15.sub0
|
|
; CHECK-NEXT: S_NOP 0, implicit %14.sub0
|
|
; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY %9.sub1
|
|
; CHECK-NEXT: S_NOP 0, implicit %8.sub1
|
|
; CHECK-NEXT: S_ENDPGM 0
|
|
%1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
%2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
%3:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %6:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
|
|
S_NOP 0, implicit-def %1.sub0
|
|
S_NOP 0, implicit-def %2.sub1
|
|
S_NOP 0, implicit-def %3.sub0
|
|
S_NOP 0, implicit %1.sub1
|
|
S_NOP 0, implicit %2.sub0
|
|
S_NOP 0, implicit %3.sub1
|
|
S_ENDPGM 0
|
|
|
|
...
|