To reduce register pressure during allocation, when the allocator spills a virtual register that corresponds to a whole wave mode operation, the spill loads and restores should be activated for all lanes by temporarily flipping all bits in the exec register to one just before the spills. This is not implemented in the compiler as of today, and this patch enables the necessary support. This is a pre-patch for the SGPR spill to virtual VGPR lanes change, which would eventually cause whole wave register spills during allocation. Reviewed By: arsenm, cdevadas Differential Revision: https://reviews.llvm.org/D143759
641 lines
28 KiB
LLVM
641 lines
28 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefix=MUBUF %s
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefix=FLATSCR %s
|
|
|
|
; Test that the VGPR spiller correctly switches to SGPR offsets when the
|
|
; instruction offset field would overflow, and that it accounts for memory
|
|
; swizzling.
|
|
|
|
; Kernel case where the spill slot of %a still fits in the instruction's
; immediate offset field: the buffer-instruction checks expect the spill and
; reload at offset:4092 with a zero SGPR offset. The flat-scratch checks expect
; the same byte offset (0xffc) to be materialized in an SGPR instead.
define amdgpu_kernel void @test_inst_offset_kernel() {
|
|
; MUBUF-LABEL: test_inst_offset_kernel:
|
|
; MUBUF: ; %bb.0: ; %entry
|
|
; MUBUF-NEXT: s_add_u32 s0, s0, s7
|
|
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_endpgm
|
|
;
|
|
; FLATSCR-LABEL: test_inst_offset_kernel:
|
|
; FLATSCR: ; %bb.0: ; %entry
|
|
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
|
|
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
|
|
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
|
|
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:8
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_endpgm
|
|
entry:
|
|
; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
|
|
; the instruction offset field.
; NOTE(review): the alloca itself is 4088 bytes; the checks access alloca + 4
; at offset:8, so the alloca appears to start at offset 4 (4 + 4088 = 4092).
|
|
%alloca = alloca i8, i32 4088, align 4, addrspace(5)
|
|
|
|
%aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
|
|
|
|
%a = load volatile i32, ptr addrspace(5) %aptr
|
|
|
|
; Force %a to spill.
|
|
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
|
|
|
|
%outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
store volatile i32 %a, ptr addrspace(5) %outptr
|
|
|
|
ret void
|
|
}
|
|
|
|
; Kernel case where the spill slot of %a no longer fits in the immediate offset
; field: the buffer-instruction checks expect the offset to be moved into an
; SGPR (s4 = 0x40000) and the flat-scratch checks expect s0 = 0x1000.
define amdgpu_kernel void @test_sgpr_offset_kernel() {
|
|
; MUBUF-LABEL: test_sgpr_offset_kernel:
|
|
; MUBUF: ; %bb.0: ; %entry
|
|
; MUBUF-NEXT: s_add_u32 s0, s0, s7
|
|
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_mov_b32 s4, 0x40000
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_endpgm
|
|
;
|
|
; FLATSCR-LABEL: test_sgpr_offset_kernel:
|
|
; FLATSCR: ; %bb.0: ; %entry
|
|
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
|
|
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
|
|
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
|
|
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:8
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_endpgm
|
|
entry:
|
|
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
|
|
; fit in the instruction, and has to live in the SGPR offset.
|
|
%alloca = alloca i8, i32 4092, align 4, addrspace(5)
|
|
|
|
%aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
; 0x40000 / 64 = 4096 (for wave64)
|
|
%a = load volatile i32, ptr addrspace(5) %aptr
|
|
; Force %a to spill
|
|
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
|
|
|
|
%outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
store volatile i32 %a, ptr addrspace(5) %outptr
|
|
|
|
ret void
|
|
}
|
|
|
|
; Non-kernel function (attribute group #2) where eight SGPR values produced by
; inline asm are live around the spill and reload of %a. The checks expect the
; out-of-range spill address to be formed in an SGPR by adding a literal to s32
; (0x40100 for the buffer run, 0x1004 for the flat-scratch run), and expect
; that add to be repeated before the reload.
define void @test_sgpr_offset_function_scavenge_fail_func() #2 {
|
|
; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_func:
|
|
; MUBUF: ; %bb.0: ; %entry
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_func:
|
|
; FLATSCR: ; %bb.0: ; %entry
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:8 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_add_i32 s8, s32, 0x1004
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s8 ; 4-byte Folded Spill
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: s_add_i32 s8, s32, 0x1004
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
|
entry:
|
|
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
|
|
; fit in the instruction, and has to live in the SGPR offset.
|
|
%alloca = alloca i8, i32 4096, align 4, addrspace(5)
|
|
|
|
%aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
|
|
%asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
|
|
%asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
|
|
%asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
|
|
%asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
|
|
%asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
|
|
%asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
|
|
%asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
|
|
%asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
|
|
%asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
|
|
|
|
; 0x40100 / 64 = 4100 (for wave64)
|
|
%a = load volatile i32, ptr addrspace(5) %aptr
|
|
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
|
|
|
|
%asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
|
|
%asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
|
|
%asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
|
|
%asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
|
|
%asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
|
|
%asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
|
|
%asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
|
|
%asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
|
|
%asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
|
|
|
|
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
|
|
; Force %a to spill with no free SGPRs
|
|
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
|
|
ret void
|
|
}
|
|
|
|
; Kernel counterpart (attribute group #3) of the scavenge-fail function test:
; eight SGPR values produced by inline asm are live around the spill and reload
; of %a. The checks expect the spill offset to be moved into an SGPR once
; (s10 = 0x40100 for the buffer run, s8 = 0x1004 for the flat-scratch run) and
; reused for the reload.
define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 {
|
|
; MUBUF-LABEL: test_sgpr_offset_function_scavenge_fail_kernel:
|
|
; MUBUF: ; %bb.0: ; %entry
|
|
; MUBUF-NEXT: s_add_u32 s0, s0, s7
|
|
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_mov_b32 s10, 0x40100
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: s_endpgm
|
|
;
|
|
; FLATSCR-LABEL: test_sgpr_offset_function_scavenge_fail_kernel:
|
|
; FLATSCR: ; %bb.0: ; %entry
|
|
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
|
|
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
|
|
; FLATSCR-NEXT: s_mov_b32 s8, 0
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s8 offset:8 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_movk_i32 s8, 0x1004
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s8 ; 4-byte Folded Spill
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s8 ; 4-byte Folded Reload
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: s_endpgm
|
|
entry:
|
|
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
|
|
; fit in the instruction, and has to live in the SGPR offset.
|
|
%alloca = alloca i8, i32 4096, align 4, addrspace(5)
|
|
|
|
%aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
|
|
%asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
|
|
%asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
|
|
%asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
|
|
%asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
|
|
%asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
|
|
%asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
|
|
%asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
|
|
%asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
|
|
%asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
|
|
|
|
; 0x40100 / 64 = 4100 (for wave64)
|
|
%a = load volatile i32, ptr addrspace(5) %aptr
|
|
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
|
|
|
|
%asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
|
|
%asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
|
|
%asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
|
|
%asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
|
|
%asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
|
|
%asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
|
|
%asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
|
|
%asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
|
|
%asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
|
|
|
|
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
|
|
; Force %a to spill with no free SGPRs
|
|
call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
|
|
ret void
|
|
}
|
|
|
|
; Kernel case spilling a two-dword value (<2 x i32>) whose last dword still
; fits in the immediate offset field. The buffer-instruction checks expect two
; dword spills at offset:4088 and offset:4092; the flat-scratch checks expect a
; single dwordx2 spill addressed through an SGPR holding 0xff8.
; NOTE(review): despite "sgpr_offset" in the name, the buffer checks here use
; immediate offsets; the name looks swapped with test_inst_offset_subregs_kernel.
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
|
|
; MUBUF-LABEL: test_sgpr_offset_subregs_kernel:
|
|
; MUBUF: ; %bb.0: ; %entry
|
|
; MUBUF-NEXT: s_add_u32 s0, s0, s7
|
|
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4088 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4092 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ; v[0:1]
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: s_endpgm
|
|
;
|
|
; FLATSCR-LABEL: test_sgpr_offset_subregs_kernel:
|
|
; FLATSCR: ; %bb.0: ; %entry
|
|
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
|
|
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
|
|
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
|
|
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_movk_i32 s0, 0xff8
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ; v[0:1]
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: s_endpgm
|
|
entry:
|
|
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
|
|
; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
|
|
; the instruction offset field.
|
|
%alloca = alloca i8, i32 4084, align 4, addrspace(5)
|
|
%aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
|
|
%a = load volatile <2 x i32>, ptr addrspace(5) %aptr
|
|
|
|
; Force %a to spill.
|
|
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
|
|
|
|
; Ensure the alloca sticks around.
|
|
%bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
%b = load volatile i32, ptr addrspace(5) %bptr
|
|
|
|
; Ensure the spill is of the full super-reg.
|
|
call void asm sideeffect "; $0", "r"(<2 x i32> %a)
|
|
|
|
ret void
|
|
}
|
|
|
|
; Kernel case spilling a two-dword value (<2 x i32>) whose last dword would
; land at offset 4096. The buffer-instruction checks expect the base offset in
; an SGPR (s4 = 0x3ff00) with the second dword at offset:4 from it; the
; flat-scratch checks expect a dwordx2 spill through an SGPR holding 0xffc.
; NOTE(review): despite "inst_offset" in the name, the buffer checks here use
; an SGPR offset; the name looks swapped with test_sgpr_offset_subregs_kernel.
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
|
|
; MUBUF-LABEL: test_inst_offset_subregs_kernel:
|
|
; MUBUF: ; %bb.0: ; %entry
|
|
; MUBUF-NEXT: s_add_u32 s0, s0, s7
|
|
; MUBUF-NEXT: s_addc_u32 s1, s1, 0
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_mov_b32 s4, 0x3ff00
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ; v[0:1]
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: s_endpgm
|
|
;
|
|
; FLATSCR-LABEL: test_inst_offset_subregs_kernel:
|
|
; FLATSCR: ; %bb.0: ; %entry
|
|
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s0, s3
|
|
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
|
|
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:12 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; 8-byte Folded Spill
|
|
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 offset:8 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_movk_i32 s0, 0xffc
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 ; 8-byte Folded Reload
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ; v[0:1]
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: s_endpgm
|
|
entry:
|
|
; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
|
|
; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
|
|
; in the SGPR offset.
|
|
%alloca = alloca i8, i32 4088, align 4, addrspace(5)
|
|
|
|
; 0x3ff00 / 64 = 4092 (for wave64)
|
|
%aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
|
|
%a = load volatile <2 x i32>, ptr addrspace(5) %aptr
|
|
|
|
; Force %a to spill.
|
|
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
|
|
|
|
; Ensure the alloca sticks around.
|
|
%bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
%b = load volatile i32, ptr addrspace(5) %bptr
|
|
|
|
; Ensure the spill is of the full super-reg.
|
|
call void asm sideeffect "; $0", "r"(<2 x i32> %a)
|
|
|
|
ret void
|
|
}
|
|
|
|
; Non-kernel function case where the spill slot of %a fits in the immediate
; offset field: both check prefixes expect the spill and reload to be addressed
; as s32 plus offset:4088, with no extra SGPR arithmetic.
define void @test_inst_offset_function() {
|
|
; MUBUF-LABEL: test_inst_offset_function:
|
|
; MUBUF: ; %bb.0: ; %entry
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; FLATSCR-LABEL: test_inst_offset_function:
|
|
; FLATSCR: ; %bb.0: ; %entry
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4088 ; 4-byte Folded Spill
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4088 ; 4-byte Folded Reload
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
|
entry:
|
|
; Occupy enough bytes of scratch, so the offset of the spill of %a
|
|
; just fits in the instruction offset field when the emergency stack
|
|
; slot is added. It's hard to hit the actual limit since we're also
|
|
; going to insert the emergency stack slot for large frames.
|
|
%alloca = alloca i8, i32 4088, align 4, addrspace(5)
|
|
|
|
%aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
|
|
|
|
%a = load volatile i32, ptr addrspace(5) %aptr
|
|
|
|
; Force %a to spill.
|
|
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
|
|
|
|
%outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
store volatile i32 %a, ptr addrspace(5) %outptr
|
|
|
|
ret void
|
|
}
|
|
|
|
; Non-kernel function case where the spill slot of %a does not fit in the
; immediate offset field: the checks expect the address to be formed in an SGPR
; by adding a literal to s32 (0x40100 for the buffer run, 0x1004 for the
; flat-scratch run), both before the spill and again before the reload.
define void @test_sgpr_offset_function() {
|
|
; MUBUF-LABEL: test_sgpr_offset_function:
|
|
; MUBUF: ; %bb.0: ; %entry
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_add_i32 s4, s32, 0x40100
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: s_add_i32 s4, s32, 0x40100
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; FLATSCR-LABEL: test_sgpr_offset_function:
|
|
; FLATSCR: ; %bb.0: ; %entry
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:8 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1004
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; 4-byte Folded Spill
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: s_add_i32 s0, s32, 0x1004
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s0 ; 4-byte Folded Reload
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:8
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
|
entry:
|
|
; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
|
|
; fit in the instruction, and has to live in the SGPR offset.
|
|
%alloca = alloca i8, i32 4096, align 4, addrspace(5)
|
|
|
|
%aptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
; 0x40100 / 64 = 4100 (for wave64)
|
|
%a = load volatile i32, ptr addrspace(5) %aptr
|
|
|
|
; Force %a to spill
|
|
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
|
|
|
|
%outptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
store volatile i32 %a, ptr addrspace(5) %outptr
|
|
|
|
ret void
|
|
}
|
|
|
|
; Non-kernel function case spilling a two-dword value (<2 x i32>) whose dwords
; both fit in the immediate offset field. The buffer-instruction checks expect
; dword spills at s32 offset:4084 and offset:4088; the flat-scratch checks
; expect one dwordx2 spill at s32 offset:4084.
; NOTE(review): despite "sgpr_offset" in the name, the checks here use
; immediate offsets; the name looks swapped with
; test_inst_offset_subregs_function.
define void @test_sgpr_offset_subregs_function() {
|
|
; MUBUF-LABEL: test_sgpr_offset_subregs_function:
|
|
; MUBUF: ; %bb.0: ; %entry
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4084 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4088 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ; v[0:1]
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; FLATSCR-LABEL: test_sgpr_offset_subregs_function:
|
|
; FLATSCR: ; %bb.0: ; %entry
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:8 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:4084 ; 8-byte Folded Spill
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:4 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4084 ; 8-byte Folded Reload
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ; v[0:1]
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
|
entry:
|
|
; We want to test the spill of the last subreg of %a is the highest
|
|
; valid value for the immediate offset. We enable the emergency
|
|
; stack slot for large frames, so it's hard to get the frame layout
|
|
; exactly as we want to test it.
|
|
; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a
|
|
; still fits below offset 4096 (4084 + 8 - 4 = 4088), and can be placed in
|
|
; the instruction offset field.
|
|
%alloca = alloca i8, i32 4084, align 4, addrspace(5)
|
|
%aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
|
|
%a = load volatile <2 x i32>, ptr addrspace(5) %aptr
|
|
|
|
; Force %a to spill.
|
|
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
|
|
|
|
; Ensure the alloca sticks around.
|
|
%bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
%b = load volatile i32, ptr addrspace(5) %bptr
|
|
|
|
; Ensure the spill is of the full super-reg.
|
|
call void asm sideeffect "; $0", "r"(<2 x i32> %a)
|
|
|
|
ret void
|
|
}
|
|
|
|
; Non-kernel function case spilling a two-dword value (<2 x i32>) whose last
; dword would land at offset 4096. The buffer-instruction checks expect the
; base address in an SGPR (s4 = s32 + 0x3ff00) with the second dword at
; offset:4 from it; the flat-scratch checks expect a dwordx2 spill at
; s32 offset:4092.
; NOTE(review): despite "inst_offset" in the name, the buffer checks here use
; an SGPR offset; the name looks swapped with
; test_sgpr_offset_subregs_function.
define void @test_inst_offset_subregs_function() {
|
|
; MUBUF-LABEL: test_inst_offset_subregs_function:
|
|
; MUBUF: ; %bb.0: ; %entry
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_add_i32 s4, s32, 0x3ff00
|
|
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Spill
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: s_add_i32 s4, s32, 0x3ff00
|
|
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s4 offset:4 ; 4-byte Folded Reload
|
|
; MUBUF-NEXT: s_waitcnt vmcnt(0)
|
|
; MUBUF-NEXT: ;;#ASMSTART
|
|
; MUBUF-NEXT: ; v[0:1]
|
|
; MUBUF-NEXT: ;;#ASMEND
|
|
; MUBUF-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; FLATSCR-LABEL: test_inst_offset_subregs_function:
|
|
; FLATSCR: ; %bb.0: ; %entry
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:12 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:4092 ; 8-byte Folded Spill
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 offset:8 glc
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s32 offset:4092 ; 8-byte Folded Reload
|
|
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
|
; FLATSCR-NEXT: ;;#ASMSTART
|
|
; FLATSCR-NEXT: ; v[0:1]
|
|
; FLATSCR-NEXT: ;;#ASMEND
|
|
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
|
|
entry:
|
|
; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
|
|
; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live
|
|
; in the SGPR offset.
|
|
%alloca = alloca i8, i32 4088, align 4, addrspace(5)
|
|
|
|
; 0x3ff00 / 64 = 4092 (for wave64)
|
|
%aptr = getelementptr <2 x i32>, ptr addrspace(5) %alloca, i32 1
|
|
%a = load volatile <2 x i32>, ptr addrspace(5) %aptr
|
|
|
|
; Force %a to spill.
|
|
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
|
|
|
|
; Ensure the alloca sticks around.
|
|
%bptr = getelementptr i32, ptr addrspace(5) %alloca, i32 1
|
|
%b = load volatile i32, ptr addrspace(5) %bptr
|
|
|
|
; Ensure the spill is of the full super-reg.
|
|
call void asm sideeffect "; $0", "r"(<2 x i32> %a)
|
|
|
|
ret void
|
|
}
|
|
|
|
attributes #0 = { nounwind }
|
|
attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
|
|
attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" }
|
|
attributes #3 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" }
|