At the moment, the emergency spill slot is a fixed object for entry
functions and chain functions, and a regular stack object otherwise.
This patch adopts the latter behaviour for entry/chain functions too. It
seems this was always the intention [1] and it will also save us a bit
of stack space in cases where the first stack object has a large
alignment.
[1] 34c8b835b1
261 lines
13 KiB
LLVM
261 lines
13 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s

; Check the lowering of large insertelement operations that go through the
; stack instead of using register indexing.

; Kernel under test: loads a <64 x i32> from %ptr, inserts %val at the
; run-time index %idx, and stores the resulting vector to %out.ptr.
;
; The autogenerated checks below expect the gfx900 GlobalISel output to write
; all 64 dwords to scratch (byte offsets 0..252), perform the insert as a
; single buffer_store_dword with "offen" at byte offset (%idx & 63) * 4, then
; reload the whole vector and write it out with sixteen global_store_dwordx4.
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr addrspace(1) %ptr, i32 %val, i32 %idx) #0 {
; GCN-LABEL: v_insert_v64i32_varidx:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[20:23], s[4:5], 0x0
; GCN-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x10
; GCN-NEXT: s_add_u32 s0, s0, s7
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0
; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40
; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x80
; GCN-NEXT: v_mov_b32_e32 v64, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
; GCN-NEXT: v_mov_b32_e32 v4, s40
; GCN-NEXT: v_mov_b32_e32 v5, s41
; GCN-NEXT: v_mov_b32_e32 v6, s42
; GCN-NEXT: v_mov_b32_e32 v7, s43
; GCN-NEXT: v_mov_b32_e32 v8, s44
; GCN-NEXT: v_mov_b32_e32 v9, s45
; GCN-NEXT: v_mov_b32_e32 v10, s46
; GCN-NEXT: v_mov_b32_e32 v11, s47
; GCN-NEXT: v_mov_b32_e32 v12, s48
; GCN-NEXT: v_mov_b32_e32 v13, s49
; GCN-NEXT: v_mov_b32_e32 v14, s50
; GCN-NEXT: v_mov_b32_e32 v15, s51
; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:4
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:12
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:16
; GCN-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:20
; GCN-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:24
; GCN-NEXT: buffer_store_dword v7, off, s[0:3], 0 offset:28
; GCN-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; GCN-NEXT: buffer_store_dword v9, off, s[0:3], 0 offset:36
; GCN-NEXT: buffer_store_dword v10, off, s[0:3], 0 offset:40
; GCN-NEXT: buffer_store_dword v11, off, s[0:3], 0 offset:44
; GCN-NEXT: buffer_store_dword v12, off, s[0:3], 0 offset:48
; GCN-NEXT: buffer_store_dword v13, off, s[0:3], 0 offset:52
; GCN-NEXT: buffer_store_dword v14, off, s[0:3], 0 offset:56
; GCN-NEXT: buffer_store_dword v15, off, s[0:3], 0 offset:60
; GCN-NEXT: v_mov_b32_e32 v0, s52
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:64
; GCN-NEXT: v_mov_b32_e32 v0, s53
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:68
; GCN-NEXT: v_mov_b32_e32 v0, s54
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:72
; GCN-NEXT: v_mov_b32_e32 v0, s55
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:76
; GCN-NEXT: v_mov_b32_e32 v0, s56
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:80
; GCN-NEXT: v_mov_b32_e32 v0, s57
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:84
; GCN-NEXT: v_mov_b32_e32 v0, s58
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:88
; GCN-NEXT: v_mov_b32_e32 v0, s59
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:92
; GCN-NEXT: v_mov_b32_e32 v0, s60
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:96
; GCN-NEXT: v_mov_b32_e32 v0, s61
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:100
; GCN-NEXT: v_mov_b32_e32 v0, s62
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:104
; GCN-NEXT: v_mov_b32_e32 v0, s63
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:108
; GCN-NEXT: v_mov_b32_e32 v0, s64
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:112
; GCN-NEXT: v_mov_b32_e32 v0, s65
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:116
; GCN-NEXT: v_mov_b32_e32 v0, s66
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:120
; GCN-NEXT: v_mov_b32_e32 v0, s67
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:124
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:132
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:136
; GCN-NEXT: v_mov_b32_e32 v0, s7
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:140
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:144
; GCN-NEXT: v_mov_b32_e32 v0, s9
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:148
; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:152
; GCN-NEXT: v_mov_b32_e32 v0, s11
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:156
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:160
; GCN-NEXT: v_mov_b32_e32 v0, s13
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:164
; GCN-NEXT: v_mov_b32_e32 v0, s14
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:168
; GCN-NEXT: v_mov_b32_e32 v0, s15
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:172
; GCN-NEXT: v_mov_b32_e32 v0, s16
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:176
; GCN-NEXT: v_mov_b32_e32 v0, s17
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:180
; GCN-NEXT: v_mov_b32_e32 v0, s18
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:184
; GCN-NEXT: v_mov_b32_e32 v0, s19
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:188
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:192
; GCN-NEXT: v_mov_b32_e32 v0, s37
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:196
; GCN-NEXT: v_mov_b32_e32 v0, s38
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:200
; GCN-NEXT: v_mov_b32_e32 v0, s39
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:204
; GCN-NEXT: v_mov_b32_e32 v0, s40
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:208
; GCN-NEXT: v_mov_b32_e32 v0, s41
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:212
; GCN-NEXT: v_mov_b32_e32 v0, s42
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:216
; GCN-NEXT: v_mov_b32_e32 v0, s43
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:220
; GCN-NEXT: v_mov_b32_e32 v0, s44
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:224
; GCN-NEXT: v_mov_b32_e32 v0, s45
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:228
; GCN-NEXT: v_mov_b32_e32 v0, s46
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:232
; GCN-NEXT: v_mov_b32_e32 v0, s47
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:236
; GCN-NEXT: v_mov_b32_e32 v0, s48
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:240
; GCN-NEXT: v_mov_b32_e32 v0, s49
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:244
; GCN-NEXT: v_mov_b32_e32 v0, s50
; GCN-NEXT: s_and_b32 s4, s25, 63
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:248
; GCN-NEXT: v_mov_b32_e32 v0, s51
; GCN-NEXT: s_lshl_b32 s4, s4, 2
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:252
; GCN-NEXT: v_add_u32_e32 v0, s4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s24
; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:8
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], 0 offset:12
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], 0 offset:16
; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:20
; GCN-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:24
; GCN-NEXT: buffer_load_dword v7, off, s[0:3], 0 offset:28
; GCN-NEXT: buffer_load_dword v8, off, s[0:3], 0 offset:32
; GCN-NEXT: buffer_load_dword v9, off, s[0:3], 0 offset:36
; GCN-NEXT: buffer_load_dword v10, off, s[0:3], 0 offset:40
; GCN-NEXT: buffer_load_dword v11, off, s[0:3], 0 offset:44
; GCN-NEXT: buffer_load_dword v12, off, s[0:3], 0 offset:48
; GCN-NEXT: buffer_load_dword v13, off, s[0:3], 0 offset:52
; GCN-NEXT: buffer_load_dword v14, off, s[0:3], 0 offset:56
; GCN-NEXT: buffer_load_dword v15, off, s[0:3], 0 offset:60
; GCN-NEXT: buffer_load_dword v16, off, s[0:3], 0 offset:64
; GCN-NEXT: buffer_load_dword v17, off, s[0:3], 0 offset:68
; GCN-NEXT: buffer_load_dword v18, off, s[0:3], 0 offset:72
; GCN-NEXT: buffer_load_dword v19, off, s[0:3], 0 offset:76
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], 0 offset:80
; GCN-NEXT: buffer_load_dword v21, off, s[0:3], 0 offset:84
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], 0 offset:88
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:92
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], 0 offset:96
; GCN-NEXT: buffer_load_dword v25, off, s[0:3], 0 offset:100
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], 0 offset:104
; GCN-NEXT: buffer_load_dword v27, off, s[0:3], 0 offset:108
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], 0 offset:112
; GCN-NEXT: buffer_load_dword v29, off, s[0:3], 0 offset:116
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], 0 offset:120
; GCN-NEXT: buffer_load_dword v31, off, s[0:3], 0 offset:124
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], 0 offset:128
; GCN-NEXT: buffer_load_dword v33, off, s[0:3], 0 offset:132
; GCN-NEXT: buffer_load_dword v34, off, s[0:3], 0 offset:136
; GCN-NEXT: buffer_load_dword v35, off, s[0:3], 0 offset:140
; GCN-NEXT: buffer_load_dword v36, off, s[0:3], 0 offset:144
; GCN-NEXT: buffer_load_dword v37, off, s[0:3], 0 offset:148
; GCN-NEXT: buffer_load_dword v38, off, s[0:3], 0 offset:152
; GCN-NEXT: buffer_load_dword v39, off, s[0:3], 0 offset:156
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], 0 offset:160
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], 0 offset:164
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], 0 offset:168
; GCN-NEXT: buffer_load_dword v43, off, s[0:3], 0 offset:172
; GCN-NEXT: buffer_load_dword v44, off, s[0:3], 0 offset:176
; GCN-NEXT: buffer_load_dword v45, off, s[0:3], 0 offset:180
; GCN-NEXT: buffer_load_dword v46, off, s[0:3], 0 offset:184
; GCN-NEXT: buffer_load_dword v47, off, s[0:3], 0 offset:188
; GCN-NEXT: buffer_load_dword v48, off, s[0:3], 0 offset:192
; GCN-NEXT: buffer_load_dword v49, off, s[0:3], 0 offset:196
; GCN-NEXT: buffer_load_dword v50, off, s[0:3], 0 offset:200
; GCN-NEXT: buffer_load_dword v51, off, s[0:3], 0 offset:204
; GCN-NEXT: buffer_load_dword v52, off, s[0:3], 0 offset:208
; GCN-NEXT: buffer_load_dword v53, off, s[0:3], 0 offset:212
; GCN-NEXT: buffer_load_dword v54, off, s[0:3], 0 offset:216
; GCN-NEXT: buffer_load_dword v55, off, s[0:3], 0 offset:220
; GCN-NEXT: buffer_load_dword v56, off, s[0:3], 0 offset:224
; GCN-NEXT: buffer_load_dword v57, off, s[0:3], 0 offset:228
; GCN-NEXT: buffer_load_dword v58, off, s[0:3], 0 offset:232
; GCN-NEXT: buffer_load_dword v59, off, s[0:3], 0 offset:236
; GCN-NEXT: buffer_load_dword v60, off, s[0:3], 0 offset:240
; GCN-NEXT: buffer_load_dword v61, off, s[0:3], 0 offset:244
; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:248
; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:252
; GCN-NEXT: s_waitcnt vmcnt(60)
; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21]
; GCN-NEXT: s_waitcnt vmcnt(57)
; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[20:21] offset:16
; GCN-NEXT: s_waitcnt vmcnt(54)
; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[20:21] offset:32
; GCN-NEXT: s_waitcnt vmcnt(51)
; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[20:21] offset:48
; GCN-NEXT: s_waitcnt vmcnt(48)
; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[20:21] offset:64
; GCN-NEXT: s_waitcnt vmcnt(45)
; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[20:21] offset:80
; GCN-NEXT: s_waitcnt vmcnt(42)
; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[20:21] offset:96
; GCN-NEXT: s_waitcnt vmcnt(39)
; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[20:21] offset:112
; GCN-NEXT: s_waitcnt vmcnt(36)
; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[20:21] offset:128
; GCN-NEXT: s_waitcnt vmcnt(33)
; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[20:21] offset:144
; GCN-NEXT: s_waitcnt vmcnt(30)
; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[20:21] offset:160
; GCN-NEXT: s_waitcnt vmcnt(27)
; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[20:21] offset:176
; GCN-NEXT: s_waitcnt vmcnt(24)
; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[20:21] offset:192
; GCN-NEXT: s_waitcnt vmcnt(21)
; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[20:21] offset:208
; GCN-NEXT: s_waitcnt vmcnt(18)
; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[20:21] offset:224
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[20:21] offset:240
; GCN-NEXT: s_endpgm
  %vec = load <64 x i32>, ptr addrspace(1) %ptr
  %insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx
  store <64 x i32> %insert, ptr addrspace(1) %out.ptr
  ret void
}
; Attribute group #0, referenced by @v_insert_v64i32_varidx: sets the
; "amdgpu-flat-work-group-size" and "amdgpu-waves-per-eu" function attributes.
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,10" }