clang-p2996/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
Commit fbdea5a2e9 by Alexander Timofeev: [AMDGPU] Always select s_cselect_b32 for uniform 'select' SDNode
This patch contains the changes necessary to carry physical condition register (SCC) dependencies through the SDNode scheduler. It adds an edge in the SDNodeScheduler dependency graph instead of inserting an SCC copy between each definition and use, which lets the scheduler place instructions optimally, inserting the copy only when the dependency cannot otherwise be resolved.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D133593
2022-09-15 22:03:56 +02:00

; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Combine on select c, (load x), (load y) -> load (select c, x, y)
; drops MachinePointerInfo, so it can't be relied on for correctness.
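
; An illustrative sketch (not part of the test, using placeholder names
; %x, %y, %c): the combine rewrites
;   %a = load i64, i64* %x
;   %b = load i64, i64* %y
;   %v = select i1 %c, i64 %a, i64 %b
; into a select over the pointers feeding a single load:
;   %p = select i1 %c, i64* %x, i64* %y
;   %v = load i64, i64* %p
; The merged load cannot preserve both loads' MachinePointerInfo, hence
; the note above.
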
; GCN-LABEL: {{^}}select_ptr_crash_i64_flat:
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: s_cmp_eq_u32
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN-NOT: load_dword
; GCN: flat_load_dwordx2
; GCN-NOT: load_dword
; GCN: flat_store_dwordx2
define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], i64* %ptr0, [8 x i32], i64* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
  %tmp2 = icmp eq i32 %tmp, 0
  %tmp3 = load i64, i64* %ptr0, align 8
  %tmp4 = load i64, i64* %ptr1, align 8
  %tmp5 = select i1 %tmp2, i64 %tmp3, i64 %tmp4
  store i64 %tmp5, i64 addrspace(1)* %ptr2, align 8
  ret void
}

; The transform currently doesn't happen for non-addrspace 0, but it
; should.
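
; A hypothetical sketch of the folded form, reusing this test's value
; names, if the combine were extended to addrspace(1):
;   %p = select i1 %tmp2, i64 addrspace(1)* %ptr0, i64 addrspace(1)* %ptr1
;   %tmp5 = load i64, i64 addrspace(1)* %p, align 8
; which would leave only one data load after selecting the pointer.
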
; GCN-LABEL: {{^}}select_ptr_crash_i64_global:
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_cselect_b32
; GCN: s_cselect_b32
; GCN: flat_store_dwordx2
define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64 addrspace(1)* %ptr0, [8 x i32], i64 addrspace(1)* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
  %tmp2 = icmp eq i32 %tmp, 0
  %tmp3 = load i64, i64 addrspace(1)* %ptr0, align 8
  %tmp4 = load i64, i64 addrspace(1)* %ptr1, align 8
  %tmp5 = select i1 %tmp2, i64 %tmp3, i64 %tmp4
  store i64 %tmp5, i64 addrspace(1)* %ptr2, align 8
  ret void
}

; GCN-LABEL: {{^}}select_ptr_crash_i64_local:
; GCN: ds_read_b64
; GCN: ds_read_b64
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
; GCN: flat_store_dwordx2
define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, i64 addrspace(3)* %ptr0, i64 addrspace(3)* %ptr1, i64 addrspace(1)* %ptr2) {
  %tmp2 = icmp eq i32 %tmp, 0
  %tmp3 = load i64, i64 addrspace(3)* %ptr0, align 8
  %tmp4 = load i64, i64 addrspace(3)* %ptr1, align 8
  %tmp5 = select i1 %tmp2, i64 %tmp3, i64 %tmp4
  store i64 %tmp5, i64 addrspace(1)* %ptr2, align 8
  ret void
}

; The transform will break addressing mode matching, so it is unclear
; whether it would be good to do.
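
; A hypothetical sketch of what the folded form would look like here:
;   %p = select i1 %tmp2, i64 addrspace(3)* %gep0, i64 addrspace(3)* %gep1
;   %tmp5 = load i64, i64 addrspace(3)* %p, align 8
; With a single load, the 128-byte and 512-byte offsets from the geps could
; no longer both be folded into a ds_read_b64 immediate offset, as the
; checks below expect.
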
; GCN-LABEL: {{^}}select_ptr_crash_i64_local_offsets:
; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:128
; GCN: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:512
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, i64 addrspace(3)* %ptr0, i64 addrspace(3)* %ptr1, i64 addrspace(1)* %ptr2) {
  %tmp2 = icmp eq i32 %tmp, 0
  %gep0 = getelementptr inbounds i64, i64 addrspace(3)* %ptr0, i64 16
  %gep1 = getelementptr inbounds i64, i64 addrspace(3)* %ptr1, i64 64
  %tmp3 = load i64, i64 addrspace(3)* %gep0, align 8
  %tmp4 = load i64, i64 addrspace(3)* %gep1, align 8
  %tmp5 = select i1 %tmp2, i64 %tmp3, i64 %tmp4
  store i64 %tmp5, i64 addrspace(1)* %ptr2, align 8
  ret void
}