The premise here is to allow non-kernel functions to locate external LDS variables without using LDS or extra magic SGPRs to do so.

1/ First it crawls the call graph to work out which external LDS variables are reachable from a given kernel.
2/ Then it creates a new `extern char[0]` variable for each kernel, which will alias all the other extern LDS variables because that's the documented behaviour of these variables.
3/ The address of that variable is written to a lookup table. The global variable is tagged with metadata to track what address it was allocated at by codegen.
4/ The assembler builds the lookup table using the metadata.
5/ Any non-kernel functions use the same magic intrinsic used by table lookups of non-dynamic LDS variables to find the address to use.

There is heavy overlap with the code paths taken for other lowering; in particular, the same intrinsic is used to pass the dynamic scope information through the same SGPR as for table lookups of static LDS.

Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D144233
606 lines
24 KiB
LLVM
606 lines
24 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
|
|
|
|
; LDS is allocated per-kernel. Module scope variables are gathered into a struct which is
|
|
; allocated at address zero, if used by the kernel. Kernel scope variables are gathered into
|
|
; a per-kernel struct and allocated immediately after the module scope.
|
|
; This test checks that the module and kernel scope variables are allocated in deterministic
|
|
; order without spurious alignment padding between the two
|
|
|
|
; External LDS is checked because it influences LDS padding in general and because it will
|
|
; not be moved into either module or kernel struct
|
|
|
|
; 2-byte LDS variable; written by @use_module and by every module_1_* kernel in this file.
@module_variable = addrspace(3) global i16 undef
|
|
|
|
; Variables are allocated into module scope block when used by a non-kernel function
|
|
; Non-kernel helper that stores 0 to @module_variable. The expected ISA uses
; v0 = 0 as both the LDS address and the stored value.
define void @use_module() #0 {
; CHECK-LABEL: use_module:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: ds_write_b16 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  store i16 0, ptr addrspace(3) @module_variable
  ret void
}
|
|
|
|
; Variables only used by kernels are specialised and allocated per-kernel
@kernel_normal = addrspace(3) global i16 undef
@kernel_overalign = addrspace(3) global i16 undef, align 4

; External LDS shall not introduce padding between module and kernel scope variables
@extern_normal = external addrspace(3) global [0 x float]
@extern_overalign = external addrspace(3) global [0 x float], align 8
|
|
|
|
|
|
; External LDS does not influence the frame when called indirectly either
|
|
; Non-kernel helper that stores the float constant 0x40091EB860000000 (shown as
; 0x4048f5c3 in the expected ISA) to element 0 of @extern_normal. The expected
; ISA loads the LDS base address from llvm.amdgcn.dynlds.offset.table, using
; s15 shifted left by 2 as the byte offset into that table.
define void @use_extern_normal() #0 {
; CHECK-LABEL: use_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, llvm.amdgcn.dynlds.offset.table@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.dynlds.offset.table@rel32@hi+12
; CHECK-NEXT: s_mov_b32 s4, s15
; CHECK-NEXT: s_ashr_i32 s5, s15, 31
; CHECK-NEXT: v_mov_b32_e32 v0, 0x4048f5c3
; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; CHECK-NEXT: s_add_u32 s4, s4, s6
; CHECK-NEXT: s_addc_u32 s5, s5, s7
; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: ds_write_b32 v1, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 0
  store float 0x40091EB860000000, ptr addrspace(3) %arrayidx
  ret void
}
|
|
|
|
; Non-kernel helper that stores 42.0 (0x42280000 in the expected ISA) to
; element 1 of @extern_overalign. The base address is loaded from
; llvm.amdgcn.dynlds.offset.table at byte offset s15 << 2, and the element
; index shows up as offset:4 on the LDS write.
define void @use_extern_overalign() #0 {
; CHECK-LABEL: use_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, llvm.amdgcn.dynlds.offset.table@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.dynlds.offset.table@rel32@hi+12
; CHECK-NEXT: s_mov_b32 s4, s15
; CHECK-NEXT: s_ashr_i32 s5, s15, 31
; CHECK-NEXT: v_mov_b32_e32 v0, 0x42280000
; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; CHECK-NEXT: s_add_u32 s4, s4, s6
; CHECK-NEXT: s_addc_u32 s5, s5, s7
; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: ds_write_b32 v1, v0 offset:4
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 1
  store float 4.200000e+01, ptr addrspace(3) %arrayidx
  ret void
}
|
|
|
|
|
|
; First 2^3 of 2^4 cases encoded into function names
|
|
; no use of extern variable from nested function
|
|
; module_variable used/not-used
|
|
; kernel variable normal/overaligned
|
|
; extern variable normal/overaligned
|
|
|
|
; Kernel carrying attribute #1 that does not call @use_module. It stores 2 to
; @kernel_normal and 0.0 to @extern_normal[%idx]. Expected ISA: the i16 write
; goes to LDS address 0 and the float write to (%idx << 2) + 4.
define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_normal_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
; CHECK-NEXT: s_add_i32 s0, s0, 4
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b32 v2, v0
; CHECK-NEXT: s_endpgm
  store i16 2, ptr addrspace(3) @kernel_normal

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}
|
|
|
|
; Kernel without attribute #1 that calls @use_module, then stores 1 to
; @module_variable, 2 to @kernel_normal and 0.0 to @extern_normal[%idx].
; Expected ISA: @module_variable at LDS address 0, @kernel_normal at offset 2,
; and the float write at (%idx << 2) + 4.
define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
; CHECK-LABEL: module_1_kernel_normal_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[8:9]
; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT: s_lshl_b32 s4, s12, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_add_i32 s4, s4, 4
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
; CHECK-NEXT: ds_write_b32 v3, v0
; CHECK-NEXT: s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_normal

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}
|
|
|
|
; Kernel carrying attribute #1 that does not call @use_module. It stores 2 to
; @kernel_overalign (align 4) and 0.0 to @extern_normal[%idx]. Expected ISA:
; the i16 write goes to LDS address 0 and the float write to (%idx << 2) + 4.
define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_overalign_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
; CHECK-NEXT: s_add_i32 s0, s0, 4
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b32 v2, v0
; CHECK-NEXT: s_endpgm
  store i16 2, ptr addrspace(3) @kernel_overalign

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}
|
|
|
|
; Kernel without attribute #1 that calls @use_module, then stores 1 to
; @module_variable, 2 to @kernel_overalign (align 4) and 0.0 to
; @extern_normal[%idx]. Expected ISA: @module_variable at LDS address 0,
; @kernel_overalign at offset 4, and the float write at (%idx << 2) + 8.
define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-LABEL: module_1_kernel_overalign_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[8:9]
; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT: s_lshl_b32 s4, s12, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_add_i32 s4, s4, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
; CHECK-NEXT: ds_write_b32 v3, v0
; CHECK-NEXT: s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_overalign

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}
|
|
|
|
; Kernel carrying attribute #1 that does not call @use_module. It stores 2 to
; @kernel_normal and 0.0 to @extern_overalign[%idx] (align 8). Expected ISA:
; the i16 write goes to LDS address 0 and the float write to (%idx << 2) + 8.
define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_normal_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
; CHECK-NEXT: s_add_i32 s0, s0, 8
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b32 v2, v0
; CHECK-NEXT: s_endpgm
  store i16 2, ptr addrspace(3) @kernel_normal

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}
|
|
|
|
; Kernel without attribute #1 that calls @use_module, then stores 1 to
; @module_variable, 2 to @kernel_normal and 0.0 to @extern_overalign[%idx]
; (align 8). Expected ISA: @module_variable at LDS address 0, @kernel_normal at
; offset 2, and the float write at (%idx << 2) + 8.
define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-LABEL: module_1_kernel_normal_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[8:9]
; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT: s_lshl_b32 s4, s12, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_add_i32 s4, s4, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
; CHECK-NEXT: ds_write_b32 v3, v0
; CHECK-NEXT: s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_normal

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}
|
|
|
|
; Kernel carrying attribute #1 that does not call @use_module. It stores 2 to
; @kernel_overalign (align 4) and 0.0 to @extern_overalign[%idx] (align 8).
; Expected ISA: the i16 write goes to LDS address 0 and the float write to
; (%idx << 2) + 8.
define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_overalign_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshl_b32 s0, s0, 2
; CHECK-NEXT: s_add_i32 s0, s0, 8
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b32 v2, v0
; CHECK-NEXT: s_endpgm
  store i16 2, ptr addrspace(3) @kernel_overalign

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}
|
|
|
|
; Kernel without attribute #1 that calls @use_module, then stores 1 to
; @module_variable, 2 to @kernel_overalign (align 4) and 0.0 to
; @extern_overalign[%idx] (align 8). Expected ISA: @module_variable at LDS
; address 0, @kernel_overalign at offset 4, and the float write at
; (%idx << 2) + 8.
define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) {
; CHECK-LABEL: module_1_kernel_overalign_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[8:9]
; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT: s_lshl_b32 s4, s12, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_add_i32 s4, s4, 8
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
; CHECK-NEXT: ds_write_b32 v3, v0
; CHECK-NEXT: s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_overalign

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}
|
|
|
|
|
|
; Second 2^3 of 2^4 cases encoded into function names
|
|
; with extern variable from nested function
|
|
; module_variable used/not-used
|
|
; kernel variable normal/overaligned
|
|
; extern variable normal/overaligned
|
|
|
|
; Kernel carrying attribute #1 that stores 2 to @kernel_normal and then calls
; @use_extern_normal. Expected ISA: the i16 write goes to LDS address 0 and
; s15 is set to 0 before the call; the callee reads s15 to index
; llvm.amdgcn.dynlds.offset.table.
define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_mov_b32 s15, 0
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9]
; CHECK-NEXT: s_endpgm
  store i16 2, ptr addrspace(3) @kernel_normal

  call void @use_extern_normal()
  ret void
}
|
|
|
|
; Kernel without attribute #1 that calls @use_module, stores 1 to
; @module_variable and 2 to @kernel_normal, then calls @use_extern_normal.
; Expected ISA: @module_variable at LDS address 0, @kernel_normal at offset 2,
; and s15 set to 4 before the second call; the callee reads s15 to index
; llvm.amdgcn.dynlds.offset.table.
define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) {
; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: s_mov_b32 s15, 4
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_normal

  call void @use_extern_normal()
  ret void
}
|
|
|
|
; Kernel carrying attribute #1 that stores 2 to @kernel_overalign (align 4) and
; then calls @use_extern_normal. Expected ISA: the i16 write goes to LDS
; address 0 and s15 is set to 2 before the call; the callee reads s15 to index
; llvm.amdgcn.dynlds.offset.table.
define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_mov_b32 s15, 2
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9]
; CHECK-NEXT: s_endpgm
  store i16 2, ptr addrspace(3) @kernel_overalign

  call void @use_extern_normal()
  ret void
}
|
|
|
|
; Kernel without attribute #1 that calls @use_module, stores 1 to
; @module_variable and 2 to @kernel_overalign (align 4), then calls
; @use_extern_normal. Expected ISA: @module_variable at LDS address 0,
; @kernel_overalign at offset 4, and s15 set to 6 before the second call; the
; callee reads s15 to index llvm.amdgcn.dynlds.offset.table.
define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) {
; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: s_mov_b32 s15, 6
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_overalign

  call void @use_extern_normal()
  ret void
}
|
|
|
|
; Kernel carrying attribute #1 that stores 2 to @kernel_normal and then calls
; @use_extern_overalign. Expected ISA: the i16 write goes to LDS address 0 and
; s15 is set to 1 before the call; the callee reads s15 to index
; llvm.amdgcn.dynlds.offset.table.
define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_mov_b32 s15, 1
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9]
; CHECK-NEXT: s_endpgm
  store i16 2, ptr addrspace(3) @kernel_normal

  call void @use_extern_overalign()
  ret void
}
|
|
|
|
; Kernel without attribute #1 that calls @use_module, stores 1 to
; @module_variable and 2 to @kernel_normal, then calls @use_extern_overalign.
; Expected ISA: @module_variable at LDS address 0, @kernel_normal at offset 2,
; and s15 set to 5 before the second call; the callee reads s15 to index
; llvm.amdgcn.dynlds.offset.table.
define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) {
; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: s_mov_b32 s15, 5
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_normal

  call void @use_extern_overalign()
  ret void
}
|
|
|
|
; Kernel carrying attribute #1 that stores 2 to @kernel_overalign (align 4) and
; then calls @use_extern_overalign. Expected ISA: the i16 write goes to LDS
; address 0 and s15 is set to 3 before the call; the callee reads s15 to index
; llvm.amdgcn.dynlds.offset.table.
define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 2
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_mov_b32 s15, 3
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[8:9]
; CHECK-NEXT: s_endpgm
  store i16 2, ptr addrspace(3) @kernel_overalign

  call void @use_extern_overalign()
  ret void
}
|
|
|
|
; Kernel without attribute #1 that calls @use_module, stores 1 to
; @module_variable and 2 to @kernel_overalign (align 4), then calls
; @use_extern_overalign. Expected ISA: @module_variable at LDS address 0,
; @kernel_overalign at offset 4, and s15 set to 7 before the second call; the
; callee reads s15 to index llvm.amdgcn.dynlds.offset.table.
define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) {
; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s8, s8, s11
; CHECK-NEXT: s_mov_b32 s32, 0
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT: s_add_u32 s0, s0, s11
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: v_mov_b32_e32 v2, 2
; CHECK-NEXT: s_mov_b32 s15, 7
; CHECK-NEXT: ds_write_b16 v0, v1
; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CHECK-NEXT: s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_overalign

  call void @use_extern_overalign()
  ret void
}
|
|
|
|
|
|
; #0 is applied to the non-kernel helper functions (@use_module, @use_extern_*);
; #1 is applied to the module_0_* kernels, none of which call @use_module.
attributes #0 = { noinline }
attributes #1 = { "amdgpu-elide-module-lds" }
|