clang-p2996/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
alex-t 1a33294652 [AMDGPU] Filtering out the inactive lanes bits when lowering copy to SCC
Normally, since the divergence analysis (DA) results are kept consistent across the selection DAG, uniform comparisons get selected to S_CMP_* and divergent ones to V_CMP_*. Sometimes, for efficiency, SSA subgraphs may be converted to VALU to avoid repeatedly copying data back and forth. Hence we have to preserve correctness when passing an i1 from the VALU to the SALU context and vice versa.

VALU operations process only the active lanes of a VGPR and ignore the inactive ones.
Each active lane corresponds to a set bit in the EXEC mask register.
SALU represents an i1 as a single bit, whereas VALU represents it as 64 bits: 0/1 versus 0/(0xffffffffffffffff & EXEC), respectively.
SALU uses the one-bit condition flag SCC, while VALU uses VCC, which is a pair of 32-bit SGPRs.
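For illustration (assuming wave64, as on the gfx900 target below): with EXEC = 0x0000000000000003, i.e. two active lanes, a VALU boolean that is true in both lanes is the lane mask 0x0000000000000003, while the same value in SALU form is simply the single bit 1 in SCC.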

To expose SCC to the VALU context we need to convert the one-bit boolean value to the corresponding 64-bit lane mask.
To return to the SALU context we need to do the opposite.
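For example, in the code generated for the test below, the uniform compare sets SCC and s_cselect_b64 materializes it as a lane mask readable by the VALU:

    s_cmp_lg_u32   s10, 0         ; uniform compare, writes SCC
    s_cselect_b64  s[6:7], -1, 0  ; SCC ? all-ones : zero, a 64-bit lane mask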

To correctly convert a 64-bit VALU boolean to either 0 or 1 we need to filter out the bits that correspond to the inactive lanes.
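This is what the lowering checked in the test below now emits: the lane-mask result is ANDed with EXEC, which also sets SCC, before the scalar select consumes it:

    s_xor_b64     s[0:1], s[6:7], vcc   ; boolean op in 64-bit lane-mask form
    s_and_b64     s[0:1], s[0:1], exec  ; drop inactive-lane bits; SCC = (result != 0)
    s_cselect_b32 s0, 2, 3              ; SCC consumed back in the SALU context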

Reviewed By: piotr

Differential Revision: https://reviews.llvm.org/D109900
2021-09-21 21:19:31 +03:00


; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
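; The divergent i1 (the xor of a uniform and a divergent compare) must be ANDed
; with exec before it is copied to SCC, so that bits left over in the inactive
; lanes do not affect the s_cselect result.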
define amdgpu_kernel void @copy_to_scc(i32 addrspace(1)* %out, i32 addrspace(1)* %in, <4 x i32> addrspace(4)* %addrSrc) {
; GCN-LABEL: copy_to_scc:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-NEXT: s_load_dword s10, s[6:7], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:252
; GCN-NEXT: s_cmp_lg_u32 s10, 0
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GCN-NEXT: s_xor_b64 s[0:1], s[6:7], vcc
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN-NEXT: s_cselect_b32 s0, 2, 3
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: global_store_dword v0, v1, s[4:5]
; GCN-NEXT: s_endpgm
entry:
%0 = load i32, i32 addrspace(1)* %in, align 4
%1 = load <4 x i32>, <4 x i32> addrspace(4)* %addrSrc, align 16
%2 = icmp ne i32 %0, 0
%3 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %1, i32 252, i32 0, i32 0)
%4 = icmp ne i32 %3, 0
%5 = xor i1 %2, %4
%result = select i1 %5, i32 2, i32 3
store i32 %result, i32 addrspace(1)* %out
ret void
}
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)