Files
clang-p2996/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
Nikita Popov 9deee6bffa [SDAG] Don't transfer !range metadata without !noundef to SDAG (PR64589)
D141386 changed the semantics of !range metadata to return poison
on violation. If !range is combined with !noundef, violation is
immediate UB instead, matching the old semantics.

In theory, these IR semantics should also carry over into SDAG.
In practice, DAGCombine has at least one key transform that is
invalid in the presence of poison, namely the conversion of logical
and/or to bitwise and/or (c7b537bf09/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (L11252)).
Ideally, we would fix this transform, but this will require
substantial work to avoid codegen regressions.

In the meantime, avoid transferring !range metadata without
!noundef, effectively restoring the old !range metadata semantics
on the SDAG layer.

Fixes https://github.com/llvm/llvm-project/issues/64589.

Differential Revision: https://reviews.llvm.org/D157685
2023-08-14 09:04:27 +02:00

3366 lines
137 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s
; Test using saddr addressing mode of global_*load_* flat instructions.
; --------------------------------------------------------------------------------
; No vgpr offset, constants
; --------------------------------------------------------------------------------
; SGPR base only
; Loads one byte straight from the inreg pointer %sbase (no offset), zero-extends
; it to i32 and returns the bits as a float. The checks show every target
; materializing 0 in v0 to serve as the VGPR offset of the saddr form.
define amdgpu_ps float @global_load_saddr_i8_offset_0(ptr addrspace(1) inreg %sbase) {
; GCN-LABEL: global_load_saddr_i8_offset_0:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%load = load i8, ptr addrspace(1) %sbase
%zext = zext i8 %load to i32
; NOTE(review): the bitcast to float presumably exists so the result is
; returned in a VGPR (hence the name %to.vgpr) - confirm against the ABI.
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum gfx9 immediate offset
; Constant byte offset 4095 from %sbase. Per the checks, gfx9 and gfx11 place
; the whole constant in the instruction's offset field, while gfx10 splits it
; into 0x800 in v0 plus an immediate of 2047.
define amdgpu_ps float @global_load_saddr_i8_offset_4095(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_4095:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_4095:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_4095:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum gfx9 immediate offset + 1
; Constant byte offset 4096 from %sbase. On every target the checks expect
; 0x1000 to be moved into v0 and the load to carry no immediate offset.
define amdgpu_ps float @global_load_saddr_i8_offset_4096(ptr addrspace(1) inreg %sbase) {
; GCN-LABEL: global_load_saddr_i8_offset_4096:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 0x1000
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_4096:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4096
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum gfx9 immediate offset + 2
; Constant byte offset 4097 from %sbase. The checks expect the constant to be
; split on every target: 0x1000 in v0 plus an immediate offset of 1.
define amdgpu_ps float @global_load_saddr_i8_offset_4097(ptr addrspace(1) inreg %sbase) {
; GCN-LABEL: global_load_saddr_i8_offset_4097:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 0x1000
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_4097:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4097
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum negative gfx9 immediate offset
; Constant byte offset -4096 from %sbase. gfx9 and gfx11 are expected to fold
; it into the immediate field; gfx10 instead builds the 64-bit address in
; v[0:1] with an add/add-with-carry pair and loads with `off` (no saddr).
define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg4096:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4096:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_neg4096:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4096
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum negative gfx9 immediate offset -1
; Constant byte offset -4097 from %sbase. gfx9 is expected to add the whole
; constant to the base with a scalar add pair and keep the saddr form; gfx10
; and gfx11 add 0xfffff000 with vector adds and keep -1 as the immediate.
define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg4097:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s2, 0xffffefff
; GFX9-NEXT: s_addc_u32 s1, s3, -1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4097:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_neg4097:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4097
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum negative gfx9 immediate offset -2
; Constant byte offset -4098 from %sbase. Same expected shape as the -4097
; case: a scalar add of the full constant on gfx9, and a vector add of
; 0xfffff000 plus a -2 immediate on gfx10 and gfx11.
define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg4098:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s2, 0xffffeffe
; GFX9-NEXT: s_addc_u32 s1, s3, -1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4098:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_neg4098:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4098
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum gfx10 immediate offset + 1
; Constant byte offset 2048 from %sbase. This is one past the largest positive
; immediate gfx10 folds elsewhere in this file (2047), so gfx10 is expected to
; put 0x800 in v0 with no immediate; gfx9 and gfx11 encode offset:2048 directly.
define amdgpu_ps float @global_load_saddr_i8_offset_2048(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_2048:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_2048:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_2048:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2048
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum gfx10 immediate offset + 2
; Constant byte offset 2049 from %sbase. gfx10 is expected to split it into
; 0x800 in v0 plus an immediate of 1; gfx9 and gfx11 encode offset:2049
; directly.
define amdgpu_ps float @global_load_saddr_i8_offset_2049(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_2049:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2049
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_2049:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_2049:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2049
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2049
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum gfx10 immediate offset + 3
; Constant byte offset 2050 from %sbase. gfx10 is expected to split it into
; 0x800 in v0 plus an immediate of 2; gfx9 and gfx11 encode offset:2050
; directly.
define amdgpu_ps float @global_load_saddr_i8_offset_2050(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_2050:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2050
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_2050:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_2050:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2050
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 2050
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum negative gfx10 immediate offset
; Constant byte offset -2048 from %sbase. All three targets are expected to
; fold it into the immediate field with 0 in v0.
define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(ptr addrspace(1) inreg %sbase) {
; GCN-LABEL: global_load_saddr_i8_offset_neg2048:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_neg2048:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2048
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum negative gfx10 immediate offset - 1
; Constant byte offset -2049 from %sbase. gfx9 and gfx11 are expected to fold
; it; gfx10 adds 0xfffff800 to the base with vector adds and keeps -1 as the
; immediate on an `off` (no saddr) load.
define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg2049:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg2049:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_neg2049:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2049
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with maximum negative gfx10 immediate offset - 2
; Constant byte offset -2050 from %sbase. gfx9 and gfx11 are expected to fold
; it; gfx10 adds 0xfffff800 to the base with vector adds and keeps -2 as the
; immediate on an `off` (no saddr) load.
define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg2050:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2050
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg2050:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_neg2050:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2050
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -2050
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with constant offset 4294967295 (2^32 - 1), the largest offset that
; still fits in 32 bits. The checks expect the saddr form to be kept, with the
; constant split between a 32-bit value in v0 and a positive immediate
; (0xfffff000 + 4095 on gfx9 and gfx11, 0xfffff800 + 2047 on gfx10).
define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_4294967295:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0xfffff000
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_4294967295:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, 0xfffff800
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_4294967295:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, 0xfffff000
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967295
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with constant offset 4294967296 (2^32), the first offset that no
; longer fits in 32 bits. The checks expect every target to build the 64-bit
; address in v[0:1] (adding 0 to the low half and 1 to the high half) and to
; load with `off` instead of the saddr form.
define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_4294967296:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_4294967296:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_4294967296:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967296
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with constant offset 4294967297 (2^32 + 1). Expected shape matches
; the 2^32 case: the high half of the address is bumped by 1 in v[0:1], and
; the remaining 1 is carried as the immediate of an `off` load.
define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_4294967297:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_4294967297:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_4294967297:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294967297
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with constant offset 4294971391 (2^32 + 4095). The expected code
; differs per target: gfx9 adds the whole constant to the base with a scalar
; add pair and keeps the saddr form; gfx10 adds 0x800 with vector adds and
; keeps 2047 as the immediate; gfx11 adds only the high-half 1 and keeps 4095
; as the immediate.
define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_4294971391:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s2, 0xfff
; GFX9-NEXT: s_addc_u32 s1, s3, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_4294971391:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_4294971391:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971391
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with constant offset 4294971392 (2^32 + 4096). No target is
; expected to use an immediate here: gfx9 adds the constant to the base with a
; scalar add pair and keeps the saddr form, while gfx10 and gfx11 add it with
; vector adds and load with `off`.
define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_4294971392:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 s0, s2, 0x1000
; GFX9-NEXT: s_addc_u32 s1, s3, 1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_4294971392:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_4294971392:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1]
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4294971392
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with constant offset -4294967295, i.e. -(2^32 - 1). Every target
; is expected to build the address in v[0:1]: -1 is added to the high half,
; and the remaining +1 in the low half is expressed as a positive add plus a
; negative immediate (0x1000 - 4095 on gfx9 and gfx11, 0x800 - 2047 on gfx10).
define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967295:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4095
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967295
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with constant offset -4294967296 (-2^32). Every target is expected
; to build the address in v[0:1] by adding 0 to the low half and -1 to the
; high half, then load with `off` and no immediate.
define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967296:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967296
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; SGPR base with constant offset -4294967297, i.e. -(2^32 + 1). Expected shape
; matches the -2^32 case: -1 is added to the high half in v[0:1], and the
; remaining -1 is carried as the immediate of an `off` load.
define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(ptr addrspace(1) inreg %sbase) {
; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967297:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1]
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 -4294967297
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; --------------------------------------------------------------------------------
; Basic addressing patterns
; --------------------------------------------------------------------------------
; Basic pattern, no immediate offset.
; The i32 argument %voffset is zero-extended to i64 and used as the byte index
; from the inreg base %sbase. The checks expect a single load that takes v0 as
; the VGPR offset and s[2:3] as the scalar base, with no address arithmetic.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Maximum positive offset on gfx9
; Zero-extended VGPR offset followed by a second GEP adding the constant 4095.
; gfx9 and gfx11 are expected to keep the saddr form with offset:4095; gfx10
; computes the full address with vector adds (adding 0x800) and keeps 2047 as
; the immediate of an `off` load.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4095
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Maximum positive offset on gfx9 + 1
; Zero-extended VGPR offset followed by a second GEP adding the constant 4096.
; No target is expected to keep the saddr form: each one adds the base and
; then 0x1000 with 64-bit vector add sequences and loads with `off`.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
; GFX11-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4096
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Maximum negative offset on gfx9
; Zero-extended VGPR offset followed by a second GEP adding the constant -4096.
; gfx9 and gfx11 are expected to keep the saddr form with offset:-4096; gfx10
; computes the full address with vector adds and loads with `off`.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -4096
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Maximum negative offset on gfx9 - 1
; Zero-extended VGPR offset followed by a second GEP adding the constant -4097.
; No target is expected to keep the saddr form: each one adds the base, then
; 0xfffff000 (with -1 in the high half), and keeps -1 as the immediate of an
; `off` load.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
; GFX11-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -4097
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Maximum positive offset on gfx10
; SGPR base + zero-extended 32-bit VGPR offset + constant 2047. The checks
; show every tested target selecting the saddr form (v0 offset, s[2:3] base)
; with the constant folded into offset:2047.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2047
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Maximum positive offset on gfx10 + 1
; Constant 2048: gfx9 and gfx11 still fold it into the saddr load as
; offset:2048. The gfx10 checks instead show the address being computed in
; VGPRs (base + offset, then + 0x800) and a load with no immediate offset.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2048
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Maximum negative offset on gfx10
; Constant -2048: the checks show every tested target keeping the saddr form
; and folding the constant into offset:-2048.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2048
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Maximum negative offset on gfx10 - 1
; Constant -2049: gfx9 and gfx11 fold it into the saddr load as offset:-2049.
; The gfx10 checks show the split form instead: the address is computed in
; VGPRs, 0xfffff800 (-2048) is added explicitly, and the remaining -1 is kept
; in the immediate offset.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -2049
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Maximum positive offset on gfx9, and immediate needs to be moved lower.
; Here the constant 4095 is applied to %sbase first and the variable offset
; second, the reverse of the GEP order used by the neighbouring tests. The
; gfx9 and gfx11 checks still show the saddr form with offset:4095; gfx10
; computes the address in VGPRs, adds 0x800 and keeps 2047 as the immediate.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 4095
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; pointer addressing done in integers
; The address is formed with ptrtoint + i64 add + inttoptr rather than a GEP.
; The checks show the saddr form (v0 offset, s[2:3] base) still being selected
; on every tested target.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
; Base is the left operand of the add, zero-extended offset the right one.
%add = add i64 %sbase.as.int, %zext.offset
%dirty.gep = inttoptr i64 %add to ptr addrspace(1)
%load = load i8, ptr addrspace(1) %dirty.gep
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; zext forced to LHS of addressing expression
; Same integer-based addressing as the ptrtoint test, but with the add
; operands swapped so the zero-extended offset comes first. The expected
; output is unchanged: a saddr load with v0 as offset and s[2:3] as base.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
; Operands are deliberately in (offset, base) order.
%add = add i64 %zext.offset, %sbase.as.int
%dirty.gep = inttoptr i64 %add to ptr addrspace(1)
%load = load i8, ptr addrspace(1) %dirty.gep
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; zext forced to LHS of addressing expression, with immediate offset
; The constant 128 is added last, after the (offset + base) sum. The checks
; show it folded into the saddr load as offset:128 on every tested target.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
%add = add i64 %zext.offset, %sbase.as.int
; Immediate applied to the combined (offset + base) value.
%add.immoffset = add i64 %add, 128
%dirty.gep = inttoptr i64 %add.immoffset to ptr addrspace(1)
%load = load i8, ptr addrspace(1) %dirty.gep
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; zext forced to LHS of addressing expression, with immediate offset in non-canonical position
; Here the constant 128 is added to the base first, and the zero-extended
; offset is added to that sum afterwards. The expected output is the same as
; for the imm_offset0 variant: a saddr load with offset:128.
define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
; Immediate applied to the base before the variable offset is added.
%add.immoffset = add i64 %sbase.as.int, 128
%add = add i64 %zext.offset, %add.immoffset
%dirty.gep = inttoptr i64 %add to ptr addrspace(1)
%load = load i8, ptr addrspace(1) %dirty.gep
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; --------------------------------------------------------------------------------
; Uniformity edge cases
; --------------------------------------------------------------------------------
; An addrspace(1) pointer stored in an addrspace(3) global. The
; uniform_ptr_in_vgprs tests below load their base pointer from here instead
; of taking it as an inreg argument.
@ptr.in.lds = internal addrspace(3) global ptr addrspace(1) undef
; Base pointer is uniform, but also in VGPRs
; The base comes from a load of @ptr.in.lds. In the checks it arrives in
; v[1:2] via a 64-bit ds load, is copied to s[0:1] with two
; v_readfirstlane_b32 instructions, and is then used as the scalar base of a
; saddr load. The gfx9 output additionally has an "s_nop 4" between the
; readfirstlanes and the load.
define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_read_b64 v[1:2], v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: s_nop 4
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_read_b64 v[1:2], v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: ds_load_b64 v[1:2], v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Base pointer is uniform, but also in VGPRs, with imm offset
; Same pattern as the test above (base loaded from @ptr.in.lds, moved to
; s[0:1] with v_readfirstlane_b32), plus a constant 42 that the checks show
; folded into the saddr load as offset:42.
define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) {
; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_read_b64 v[1:2], v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: s_nop 4
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_read_b64 v[1:2], v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: ds_load_b64 v[1:2], v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:42
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 42
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Both 64-bit base and 32-bit offset are scalar
; Both arguments are inreg. In the checks the offset arrives in s4 and is
; copied into v0 so that it can serve as the VGPR offset operand of the saddr
; load; the base stays in s[2:3].
define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
; As in the test above, the scalar offset (s4) is copied into v0; the extra
; constant -24 is folded into the saddr load as offset:-24.
define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-24
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -24
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Both components uniform, zext forced to LHS of addressing expression
; Integer-based addressing (ptrtoint / add / inttoptr) with the zero-extended
; scalar offset as the first add operand. The expected output matches the
; GEP-based uniform_offset test: s4 is copied to v0 and used as the offset of
; a saddr load from s[2:3].
define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
%add = add i64 %zext.offset, %sbase.as.int
%dirty.gep = inttoptr i64 %add to ptr addrspace(1)
%load = load i8, ptr addrspace(1) %dirty.gep
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Both components uniform, zext forced to LHS of addressing expression, with immediate offset
; Same as the test above with a trailing "+ 128" on the integer address; the
; checks show the constant folded into the saddr load as offset:128.
define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr addrspace(1) inreg %sbase, i32 inreg %soffset) {
; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%sbase.as.int = ptrtoint ptr addrspace(1) %sbase to i64
%add = add i64 %zext.offset, %sbase.as.int
%add.immoffset = add i64 %add, 128
%dirty.gep = inttoptr i64 %add.immoffset to ptr addrspace(1)
%load = load i8, ptr addrspace(1) %dirty.gep
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; divergent 64-bit base, 32-bit scalar offset.
; The base pointer is not inreg and arrives in v[0:1]; only the offset (s2)
; is scalar. The checks show that the saddr form is not used: the offset is
; added to the base with a 64-bit VGPR add and the load uses "off" as its
; scalar operand.
define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(ptr addrspace(1) %vbase, i32 inreg %soffset) {
; GFX9-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; divergent 64-bit base, 32-bit scalar offset, with imm offset
; VGPR base + scalar offset + constant 4095. No target uses the saddr form.
; gfx9 and gfx11 fold the constant into the load as offset:4095; gfx10 adds
; 0x800 with an extra VGPR add pair and keeps 2047 as the immediate.
define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(ptr addrspace(1) %vbase, i32 inreg %soffset) {
; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %vbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 4095
%load = load i8, ptr addrspace(1) %gep1
%zext = zext i8 %load to i32
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; --------------------------------------------------------------------------------
; Natural addressing shifts with restricted range
; --------------------------------------------------------------------------------
; Cannot push the shift into 32-bits, and cannot match.
; The index is an i32 loaded from memory with no !range metadata and is used
; in a float-typed GEP, so it has to be scaled by 4 after zero extension. The
; checks show a 64-bit shift (v_lshlrev_b64 by 2) followed by a 64-bit add of
; the base in VGPRs, and a load that does not use the saddr form.
define amdgpu_ps float @global_load_saddr_f32_natural_addressing(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
; GFX9-LABEL: global_load_saddr_f32_natural_addressing:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_saddr_f32_natural_addressing:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f32_natural_addressing:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%voffset = load i32, ptr addrspace(1) %voffset.ptr
%zext.offset = zext i32 %voffset to i64
%gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load float, ptr addrspace(1) %gep
ret float %load
}
; Cannot push the shift into 32-bits, with an immediate offset.
; The offset is an i32 loaded from memory with no !range metadata, followed by
; a constant 128.
; TODO(review): unlike the test above, both GEPs in this body index i8, so the
; offset is not scaled and there is no shift involved. The checks accordingly
; show the saddr form being selected with offset:128. Confirm whether a
; float-typed GEP was intended for this test.
define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%voffset = load i32, ptr addrspace(1) %voffset.ptr
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 128
%load = load float, ptr addrspace(1) %gep1
ret float %load
}
; Range is sufficiently restricted to push the shift into 32-bits.
; The loaded index carries both !range !0 and !noundef (the !0 node is defined
; outside this part of the file). With that information the checks show the
; scaling done as a 32-bit shift (v_lshlrev_b32 by 2) and the result used
; directly as the VGPR offset of a saddr load.
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
; !noundef accompanies !range here; per the commit description in this file's
; header, !range without !noundef is not transferred to SelectionDAG.
%voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
%zext.offset = zext i32 %voffset to i64
%gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load float, ptr addrspace(1) %gep
ret float %load
}
; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset
; Same as the range test above plus a second float GEP by 100 elements. The
; checks show the 32-bit shift, the saddr form, and the constant folded as a
; byte offset of 400 (100 elements of 4 bytes).
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:400
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:400
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !0, !noundef !{}
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds float, ptr addrspace(1) %gep0, i64 100
%load = load float, ptr addrspace(1) %gep1
ret float %load
}
; Range is 1 beyond the limit where we can move the shift into 32-bits.
; Uses !range !1 instead of !0 (both nodes are defined outside this part of
; the file). The expected output is the same as for the test with no range
; metadata: a 64-bit shift (v_lshlrev_b64 by 2), a 64-bit VGPR add of the
; base, and a load that does not use the saddr form.
define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(ptr addrspace(1) inreg %sbase, ptr addrspace(1) %voffset.ptr) {
; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%voffset = load i32, ptr addrspace(1) %voffset.ptr, !range !1, !noundef !{}
%zext.offset = zext i32 %voffset to i64
%gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load float, ptr addrspace(1) %gep
ret float %load
}
; --------------------------------------------------------------------------------
; Stress various type loads
; --------------------------------------------------------------------------------
; i16 load from SGPR base + zero-extended VGPR offset, returned as half via a
; bitcast. Expected to select a 16-bit unsigned saddr load
; (global_load_ushort, or global_load_u16 on gfx11).
define amdgpu_ps half @global_load_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ushort v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i16, ptr addrspace(1) %gep0
%cast.load = bitcast i16 %load to half
ret half %cast.load
}
; i16 variant with an extra constant byte offset of -128, which the checks
; show folded into the saddr load as offset:-128.
define amdgpu_ps half @global_load_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i16, ptr addrspace(1) %gep1
%cast.load = bitcast i16 %load to half
ret half %cast.load
}
; half load from SGPR base + zero-extended VGPR offset. The expected
; instruction is the same 16-bit saddr load as in the i16 test.
define amdgpu_ps half @global_load_saddr_f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ushort v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load half, ptr addrspace(1) %gep0
ret half %load
}
; half variant with an extra constant byte offset of -128, folded into the
; saddr load as offset:-128.
define amdgpu_ps half @global_load_saddr_f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f16_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load half, ptr addrspace(1) %gep1
ret half %load
}
; i32 load from SGPR base + zero-extended VGPR offset, returned as float via
; a bitcast. Expected to select a 32-bit saddr load (global_load_dword, or
; global_load_b32 on gfx11).
define amdgpu_ps float @global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i32:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i32, ptr addrspace(1) %gep0
%cast.load = bitcast i32 %load to float
ret float %cast.load
}
; i32 variant with an extra constant byte offset of -128, folded into the
; saddr load as offset:-128.
define amdgpu_ps float @global_load_saddr_i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i32_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i32_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i32, ptr addrspace(1) %gep1
%cast.load = bitcast i32 %load to float
ret float %cast.load
}
; Loads a float from %sbase + zext(%voffset) and returns it unchanged.
; Expected: one saddr-form 32-bit load (global_load_dword on gfx900/gfx1010,
; global_load_b32 on gfx1100) using the SGPR pair as base and v0 as offset.
define amdgpu_ps float @global_load_saddr_f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f32:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load float, ptr addrspace(1) %gep0
ret float %load
}
; Loads a float from %sbase + zext(%voffset) - 128 and returns it unchanged.
; Expected: one saddr-form 32-bit load (global_load_dword on gfx900/gfx1010,
; global_load_b32 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps float @global_load_saddr_f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f32_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f32_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load float, ptr addrspace(1) %gep1
ret float %load
}
; Loads a <2 x i16> from %sbase + zext(%voffset) and bitcasts it to <2 x half> for the return.
; Expected: one saddr-form 32-bit load (global_load_dword on gfx900/gfx1010,
; global_load_b32 on gfx1100) using the SGPR pair as base and v0 as offset.
define amdgpu_ps <2 x half> @global_load_saddr_v2i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <2 x i16>, ptr addrspace(1) %gep0
%cast.load = bitcast <2 x i16> %load to <2 x half>
ret <2 x half> %cast.load
}
; Loads a <2 x i16> from %sbase + zext(%voffset) - 128 and bitcasts it to <2 x half> for the return.
; Expected: one saddr-form 32-bit load (global_load_dword on gfx900/gfx1010,
; global_load_b32 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i16_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <2 x i16>, ptr addrspace(1) %gep1
%cast.load = bitcast <2 x i16> %load to <2 x half>
ret <2 x half> %cast.load
}
; Loads a <2 x half> from %sbase + zext(%voffset) and returns it unchanged.
; Expected: one saddr-form 32-bit load (global_load_dword on gfx900/gfx1010,
; global_load_b32 on gfx1100) using the SGPR pair as base and v0 as offset.
define amdgpu_ps <2 x half> @global_load_saddr_v2f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <2 x half>, ptr addrspace(1) %gep0
ret <2 x half> %load
}
; Loads a <2 x half> from %sbase + zext(%voffset) - 128 and returns it unchanged.
; Expected: one saddr-form 32-bit load (global_load_dword on gfx900/gfx1010,
; global_load_b32 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f16_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <2 x half>, ptr addrspace(1) %gep1
ret <2 x half> %load
}
; Loads an addrspace(3) pointer from %sbase + zext(%voffset), converts it to i32
; with ptrtoint and bitcasts that to <2 x half> for the return.
; Expected: one saddr-form 32-bit load (global_load_dword on gfx900/gfx1010,
; global_load_b32 on gfx1100); the casts add no instructions to the checks.
define amdgpu_ps <2 x half> @global_load_saddr_p3(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p3:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p3:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load ptr addrspace(3), ptr addrspace(1) %gep0
%cast.load0 = ptrtoint ptr addrspace(3) %load to i32
%cast.load1 = bitcast i32 %cast.load0 to <2 x half>
ret <2 x half> %cast.load1
}
; Loads an addrspace(3) pointer from %sbase + zext(%voffset) - 128, converts it
; to i32 with ptrtoint and bitcasts that to <2 x half> for the return.
; Expected: one saddr-form 32-bit load (global_load_dword on gfx900/gfx1010,
; global_load_b32 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p3_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p3_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load ptr addrspace(3), ptr addrspace(1) %gep1
%cast.load0 = ptrtoint ptr addrspace(3) %load to i32
%cast.load1 = bitcast i32 %cast.load0 to <2 x half>
ret <2 x half> %cast.load1
}
; Loads a double from %sbase + zext(%voffset) and bitcasts it to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100).
define amdgpu_ps <2 x float> @global_load_saddr_f64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f64:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load double, ptr addrspace(1) %gep0
%cast.load = bitcast double %load to <2 x float>
ret <2 x float> %cast.load
}
; Loads a double from %sbase + zext(%voffset) - 128 and bitcasts it to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_f64_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_f64_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load double, ptr addrspace(1) %gep1
%cast.load = bitcast double %load to <2 x float>
ret <2 x float> %cast.load
}
; Loads an i64 from %sbase + zext(%voffset) and bitcasts it to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100).
define amdgpu_ps <2 x float> @global_load_saddr_i64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i64:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i64, ptr addrspace(1) %gep0
%cast.load = bitcast i64 %load to <2 x float>
ret <2 x float> %cast.load
}
; Loads an i64 from %sbase + zext(%voffset) - 128 and bitcasts it to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i64_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i64_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i64, ptr addrspace(1) %gep1
%cast.load = bitcast i64 %load to <2 x float>
ret <2 x float> %cast.load
}
; Loads a <2 x float> from %sbase + zext(%voffset) and returns it unchanged.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100).
define amdgpu_ps <2 x float> @global_load_saddr_v2f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <2 x float>, ptr addrspace(1) %gep0
ret <2 x float> %load
}
; Loads a <2 x float> from %sbase + zext(%voffset) - 128 and returns it unchanged.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2f32_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2f32_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <2 x float>, ptr addrspace(1) %gep1
ret <2 x float> %load
}
; Loads a <2 x i32> from %sbase + zext(%voffset) and bitcasts it to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100).
define amdgpu_ps <2 x float> @global_load_saddr_v2i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <2 x i32>, ptr addrspace(1) %gep0
%cast.load = bitcast <2 x i32> %load to <2 x float>
ret <2 x float> %cast.load
}
; Loads a <2 x i32> from %sbase + zext(%voffset) - 128 and bitcasts it to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i32_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i32_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <2 x i32>, ptr addrspace(1) %gep1
%cast.load = bitcast <2 x i32> %load to <2 x float>
ret <2 x float> %cast.load
}
; Loads a <4 x i16> from %sbase + zext(%voffset) and bitcasts it to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100).
define amdgpu_ps <2 x float> @global_load_saddr_v4i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <4 x i16>, ptr addrspace(1) %gep0
%cast.load = bitcast <4 x i16> %load to <2 x float>
ret <2 x float> %cast.load
}
; Loads a <4 x i16> from %sbase + zext(%voffset) - 128 and bitcasts it to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4i16_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <4 x i16>, ptr addrspace(1) %gep1
%cast.load = bitcast <4 x i16> %load to <2 x float>
ret <2 x float> %cast.load
}
; Loads a <4 x half> from %sbase + zext(%voffset) and bitcasts it to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100).
define amdgpu_ps <2 x float> @global_load_saddr_v4f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <4 x half>, ptr addrspace(1) %gep0
%cast.load = bitcast <4 x half> %load to <2 x float>
ret <2 x float> %cast.load
}
; Loads a <4 x half> from %sbase + zext(%voffset) - 128 and bitcasts it to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4f16_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <4 x half>, ptr addrspace(1) %gep1
%cast.load = bitcast <4 x half> %load to <2 x float>
ret <2 x float> %cast.load
}
; Loads an addrspace(1) pointer from %sbase + zext(%voffset), converts it to i64
; with ptrtoint and bitcasts that to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100); the casts add no instructions to the checks.
define amdgpu_ps <2 x float> @global_load_saddr_p1(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p1:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p1:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load ptr addrspace(1), ptr addrspace(1) %gep0
%cast.load0 = ptrtoint ptr addrspace(1) %load to i64
%cast.load1 = bitcast i64 %cast.load0 to <2 x float>
ret <2 x float> %cast.load1
}
; Loads an addrspace(1) pointer from %sbase + zext(%voffset) - 128, converts it
; to i64 with ptrtoint and bitcasts that to <2 x float> for the return.
; Expected: one saddr-form 64-bit load into v[0:1] (global_load_dwordx2 on
; gfx900/gfx1010, global_load_b64 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_p1_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_p1_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load ptr addrspace(1), ptr addrspace(1) %gep1
%cast.load0 = ptrtoint ptr addrspace(1) %load to i64
%cast.load1 = bitcast i64 %cast.load0 to <2 x float>
ret <2 x float> %cast.load1
}
; Loads a <3 x float> from %sbase + zext(%voffset) and returns it unchanged.
; Expected: one saddr-form 96-bit load into v[0:2] (global_load_dwordx3 on
; gfx900/gfx1010, global_load_b96 on gfx1100).
define amdgpu_ps <3 x float> @global_load_saddr_v3f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3f32:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v3f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <3 x float>, ptr addrspace(1) %gep0
ret <3 x float> %load
}
; Loads a <3 x float> from %sbase + zext(%voffset) - 128 and returns it unchanged.
; Expected: one saddr-form 96-bit load into v[0:2] (global_load_dwordx3 on
; gfx900/gfx1010, global_load_b96 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3f32_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v3f32_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <3 x float>, ptr addrspace(1) %gep1
ret <3 x float> %load
}
; Loads a <3 x i32> from %sbase + zext(%voffset) and bitcasts it to <3 x float> for the return.
; Expected: one saddr-form 96-bit load into v[0:2] (global_load_dwordx3 on
; gfx900/gfx1010, global_load_b96 on gfx1100).
define amdgpu_ps <3 x float> @global_load_saddr_v3i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3i32:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v3i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <3 x i32>, ptr addrspace(1) %gep0
%cast.load = bitcast <3 x i32> %load to <3 x float>
ret <3 x float> %cast.load
}
; Loads a <3 x i32> from %sbase + zext(%voffset) - 128 and bitcasts it to <3 x float> for the return.
; Expected: one saddr-form 96-bit load into v[0:2] (global_load_dwordx3 on
; gfx900/gfx1010, global_load_b96 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v3i32_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v3i32_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <3 x i32>, ptr addrspace(1) %gep1
%cast.load = bitcast <3 x i32> %load to <3 x float>
ret <3 x float> %cast.load
}
; Loads a <6 x half> from %sbase + zext(%voffset) and returns it unchanged.
; Expected: one saddr-form 96-bit load into v[0:2] (global_load_dwordx3 on
; gfx900/gfx1010, global_load_b96 on gfx1100).
define amdgpu_ps <6 x half> @global_load_saddr_v6f16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v6f16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v6f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <6 x half>, ptr addrspace(1) %gep0
ret <6 x half> %load
}
; Loads a <6 x half> from %sbase + zext(%voffset) - 128 and returns it unchanged.
; Expected: one saddr-form 96-bit load into v[0:2] (global_load_dwordx3 on
; gfx900/gfx1010, global_load_b96 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v6f16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v6f16_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <6 x half>, ptr addrspace(1) %gep1
ret <6 x half> %load
}
; Loads a <4 x float> from %sbase + zext(%voffset) and returns it unchanged.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100).
define amdgpu_ps <4 x float> @global_load_saddr_v4f32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f32:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <4 x float>, ptr addrspace(1) %gep0
ret <4 x float> %load
}
; Loads a <4 x float> from %sbase + zext(%voffset) - 128 and returns it unchanged.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f32_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4f32_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <4 x float>, ptr addrspace(1) %gep1
ret <4 x float> %load
}
; Loads a <4 x i32> from %sbase + zext(%voffset) and bitcasts it to <4 x float> for the return.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100).
define amdgpu_ps <4 x float> @global_load_saddr_v4i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <4 x i32>, ptr addrspace(1) %gep0
%cast.load = bitcast <4 x i32> %load to <4 x float>
ret <4 x float> %cast.load
}
; Loads a <4 x i32> from %sbase + zext(%voffset) - 128 and bitcasts it to <4 x float> for the return.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i32_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4i32_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <4 x i32>, ptr addrspace(1) %gep1
%cast.load = bitcast <4 x i32> %load to <4 x float>
ret <4 x float> %cast.load
}
; Loads a <2 x i64> from %sbase + zext(%voffset) and bitcasts it to <4 x float> for the return.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100).
define amdgpu_ps <4 x float> @global_load_saddr_v2i64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i64:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <2 x i64>, ptr addrspace(1) %gep0
%cast.load = bitcast <2 x i64> %load to <4 x float>
ret <4 x float> %cast.load
}
; Loads a <2 x i64> from %sbase + zext(%voffset) - 128 and bitcasts it to <4 x float> for the return.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i64_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2i64_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <2 x i64>, ptr addrspace(1) %gep1
%cast.load = bitcast <2 x i64> %load to <4 x float>
ret <4 x float> %cast.load
}
; Loads an i128 from %sbase + zext(%voffset) and bitcasts it to <4 x float> for the return.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100).
define amdgpu_ps <4 x float> @global_load_saddr_i128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i128, ptr addrspace(1) %gep0
%cast.load = bitcast i128 %load to <4 x float>
ret <4 x float> %cast.load
}
; Loads an i128 from %sbase + zext(%voffset) - 128 and bitcasts it to <4 x float> for the return.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i128_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i128_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i128, ptr addrspace(1) %gep1
%cast.load = bitcast i128 %load to <4 x float>
ret <4 x float> %cast.load
}
; Loads a vector of two addrspace(1) pointers from %sbase + zext(%voffset),
; converts it to <2 x i64> with ptrtoint and bitcasts that to <4 x float> for the return.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100); the casts add no instructions to the checks.
define amdgpu_ps <4 x float> @global_load_saddr_v2p1(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2p1:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2p1:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <2 x ptr addrspace(1)>, ptr addrspace(1) %gep0
%cast.load0 = ptrtoint <2 x ptr addrspace(1)> %load to <2 x i64>
%cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
ret <4 x float> %cast.load1
}
; Loads a vector of two addrspace(1) pointers from %sbase + zext(%voffset) - 128,
; converts it to <2 x i64> with ptrtoint and bitcasts that to <4 x float> for the return.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2p1_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v2p1_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <2 x ptr addrspace(1)>, ptr addrspace(1) %gep1
%cast.load0 = ptrtoint <2 x ptr addrspace(1)> %load to <2 x i64>
%cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
ret <4 x float> %cast.load1
}
; Loads a vector of four addrspace(3) pointers from %sbase + zext(%voffset),
; converts it to <4 x i32> with ptrtoint and bitcasts that to <4 x float> for the return.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100); the casts add no instructions to the checks.
define amdgpu_ps <4 x float> @global_load_saddr_v4p3(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4p3:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4p3:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load <4 x ptr addrspace(3)>, ptr addrspace(1) %gep0
%cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32>
%cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
ret <4 x float> %cast.load1
}
; Loads a vector of four addrspace(3) pointers from %sbase + zext(%voffset) - 128,
; converts it to <4 x i32> with ptrtoint and bitcasts that to <4 x float> for the return.
; Expected: one saddr-form 128-bit load into v[0:3] (global_load_dwordx4 on
; gfx900/gfx1010, global_load_b128 on gfx1100) with the constant carried as offset:-128.
define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4p3_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_v4p3_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load <4 x ptr addrspace(3)>, ptr addrspace(1) %gep1
%cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32>
%cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
ret <4 x float> %cast.load1
}
; --------------------------------------------------------------------------------
; Extending loads
; --------------------------------------------------------------------------------
; Loads an i8 from %sbase + zext(%voffset), sign-extends it to i32 and bitcasts
; the result to float for the return.
; Expected: the sext is absorbed into one saddr-form signed byte load
; (global_load_sbyte on gfx900/gfx1010, global_load_i8 on gfx1100); the checks
; allow no separate extend instruction.
define amdgpu_ps float @global_sextload_saddr_i8(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i8:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_sbyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_sextload_saddr_i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_i8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
%sextload = sext i8 %load to i32
%cast.load = bitcast i32 %sextload to float
ret float %cast.load
}
; Sign-extending i8 load from %sbase + zext(%voffset) - 128 bytes.
; Same expectation as the variant without the constant: one signed-byte load
; on s[2:3] + v0, here with the -128 in the immediate offset field.
define amdgpu_ps float @global_sextload_saddr_i8_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i8_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_sextload_saddr_i8_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i8, ptr addrspace(1) %gep1
%sextload = sext i8 %load to i32
%cast.load = bitcast i32 %sextload to float
ret float %cast.load
}
; Sign-extending i16 load from %sbase + zext(%voffset).
; The checks expect only a signed-short load (global_load_sshort on
; gfx9/gfx10, global_load_i16 on gfx11) with s[2:3] as base and v0 as offset.
define amdgpu_ps float @global_sextload_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_sshort v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_sextload_saddr_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_i16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; No explicit alignment is given on this i16 load.
%load = load i16, ptr addrspace(1) %gep0
%sextload = sext i16 %load to i32
%cast.load = bitcast i32 %sextload to float
ret float %cast.load
}
; Sign-extending i16 load from %sbase + zext(%voffset) - 128 bytes.
; The checks expect one signed-short load on s[2:3] + v0 with the -128 in the
; immediate offset field.
define amdgpu_ps float @global_sextload_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_sshort v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_sextload_saddr_i16_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_i16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i16, ptr addrspace(1) %gep1
%sextload = sext i16 %load to i32
%cast.load = bitcast i32 %sextload to float
ret float %cast.load
}
; Zero-extending i8 load from %sbase + zext(%voffset).
; The checks expect only an unsigned-byte load (global_load_ubyte on
; gfx9/gfx10, global_load_u8 on gfx11) with s[2:3] as base and v0 as offset;
; no separate extend or mask instruction appears in the expected output.
define amdgpu_ps float @global_zextload_saddr_i8(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i8:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_zextload_saddr_i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
%zextload = zext i8 %load to i32
; Reinterpret the 32-bit result as float to match the return type.
%cast.load = bitcast i32 %zextload to float
ret float %cast.load
}
; Zero-extending i8 load from %sbase + zext(%voffset) - 128 bytes.
; The checks expect one unsigned-byte load on s[2:3] + v0 with the -128 in the
; immediate offset field.
define amdgpu_ps float @global_zextload_saddr_i8_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i8_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_zextload_saddr_i8_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i8, ptr addrspace(1) %gep1
%zextload = zext i8 %load to i32
%cast.load = bitcast i32 %zextload to float
ret float %cast.load
}
; Zero-extending i16 load from %sbase + zext(%voffset).
; The checks expect only an unsigned-short load (global_load_ushort on
; gfx9/gfx10, global_load_u16 on gfx11) with s[2:3] as base and v0 as offset.
define amdgpu_ps float @global_zextload_saddr_i16(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i16:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ushort v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_zextload_saddr_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; No explicit alignment is given on this i16 load.
%load = load i16, ptr addrspace(1) %gep0
%zextload = zext i16 %load to i32
%cast.load = bitcast i32 %zextload to float
ret float %cast.load
}
; Zero-extending i16 load from %sbase + zext(%voffset) - 128 bytes.
; The checks expect one unsigned-short load on s[2:3] + v0 with the -128 in
; the immediate offset field.
define amdgpu_ps float @global_zextload_saddr_i16_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i16_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_zextload_saddr_i16_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i16, ptr addrspace(1) %gep1
%zextload = zext i16 %load to i32
%cast.load = bitcast i32 %zextload to float
ret float %cast.load
}
; --------------------------------------------------------------------------------
; Atomic load
; --------------------------------------------------------------------------------
; seq_cst atomic i32 load from %sbase + zext(%voffset).
; All three targets still use the s[2:3] + v0 address form for the load, but
; the expected output differs per target, so each has its own check block:
;  - gfx9:  s_waitcnt before, glc on the load, buffer_wbinvl1 after.
;  - gfx10: s_waitcnt and s_waitcnt_vscnt before, glc dlc on the load,
;           buffer_gl0_inv and buffer_gl1_inv after.
;  - gfx11: as gfx10, but global_load_b32 with glc only.
; NOTE(review): the surrounding waits and invalidates are presumably the
; backend's lowering of the seq_cst ordering - confirm against the AMDGPU
; memory model documentation.
define amdgpu_ps float @atomic_global_load_saddr_i32(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Atomic loads must state their alignment explicitly; 4 matches the i32 size.
%load = load atomic i32, ptr addrspace(1) %gep0 seq_cst, align 4
%cast.load = bitcast i32 %load to float
ret float %cast.load
}
; seq_cst atomic i32 load from %sbase + zext(%voffset) - 128 bytes.
; The expected sequence on each target is the same as for the variant without
; the constant (waits before, glc / glc dlc on the load, cache invalidates
; after); the only difference is offset:-128 on the load instruction.
define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i32_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load atomic i32, ptr addrspace(1) %gep1 seq_cst, align 4
%cast.load = bitcast i32 %load to float
ret float %cast.load
}
; seq_cst atomic i64 load from %sbase + zext(%voffset).
; 64-bit counterpart of the atomic i32 test: the checks expect a two-dword
; load (global_load_dwordx2 on gfx9/gfx10, global_load_b64 on gfx11) into
; v[0:1] on s[2:3] + v0, surrounded by the same per-target waits, glc / dlc
; bits and cache invalidates.
define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Atomic loads must state their alignment explicitly; 8 matches the i64 size.
%load = load atomic i64, ptr addrspace(1) %gep0 seq_cst, align 8
; The i64 is reinterpreted as two floats so it is returned in two registers.
%cast.load = bitcast i64 %load to <2 x float>
ret <2 x float> %cast.load
}
; seq_cst atomic i64 load from %sbase + zext(%voffset) - 128 bytes.
; The expected sequence on each target matches the atomic i64 variant without
; the constant; the only difference is offset:-128 on the load instruction.
define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: atomic_global_load_saddr_i64_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load atomic i64, ptr addrspace(1) %gep1 seq_cst, align 8
%cast.load = bitcast i64 %load to <2 x float>
ret <2 x float> %cast.load
}
; --------------------------------------------------------------------------------
; D16 load (low 16)
; --------------------------------------------------------------------------------
; i16 load from %sbase + zext(%voffset), inserted into element 0 of an
; otherwise-undef <2 x i16>.
; The checks expect a single d16 load (global_load_short_d16 on gfx9/gfx10,
; global_load_d16_b16 on gfx11) straight into v0 on s[2:3] + v0, with no extra
; instruction to build the vector.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i16, ptr addrspace(1) %gep0
; Element 1 stays undef; only element 0 carries the loaded value.
%build = insertelement <2 x i16> undef, i16 %load, i32 0
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i16 load from %sbase + zext(%voffset) - 128 bytes, inserted into element 0
; of an otherwise-undef <2 x i16>.
; The checks expect a single d16 load into v0 on s[2:3] + v0 with the -128 in
; the immediate offset field.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i16, ptr addrspace(1) %gep1
; Element 1 stays undef; only element 0 carries the loaded value.
%build = insertelement <2 x i16> undef, i16 %load, i32 0
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i16 load from %sbase + zext(%voffset), inserted into element 0 of a
; zeroinitializer <2 x i16>, so element 1 of the result must be 0.
; The checks expect v1 to be set to 0 first, the d16 load to target v1, and
; the result to be copied into v0 afterwards.
; NOTE(review): presumably the d16 load leaves the other 16 bits of its
; destination untouched, which is why the zero is materialized before the
; load - confirm against the ISA documentation.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i16, ptr addrspace(1) %gep0
%build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i16 load from %sbase + zext(%voffset) - 128 bytes, inserted into element 0
; of a zeroinitializer <2 x i16>, so element 1 of the result must be 0.
; The checks expect v1 = 0, then a d16 load into v1 with offset:-128, then a
; copy of v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i16, ptr addrspace(1) %gep1
%build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i16 load from %sbase + zext(%voffset), inserted into element 0 of the
; incoming %reg vector; element 1 of %reg is kept.
; The checks expect the d16 load to write v1 directly and the result to be
; copied into v0 afterwards, with no separate merge instruction.
; NOTE(review): v1 is presumably the register in which %reg arrives - confirm.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i16, ptr addrspace(1) %gep0
%build = insertelement <2 x i16> %reg, i16 %load, i32 0
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i16 load from %sbase + zext(%voffset) - 128 bytes, inserted into element 0
; of the incoming %reg vector; element 1 of %reg is kept.
; The checks expect a d16 load into v1 with offset:-128, followed by a copy of
; v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i16, ptr addrspace(1) %gep1
%build = insertelement <2 x i16> %reg, i16 %load, i32 0
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i8 load from %sbase + zext(%voffset), zero-extended to i16 and inserted
; into element 0 of the incoming %reg vector.
; The checks expect the load, the extend and the insert to become a single
; unsigned-byte d16 load (global_load_ubyte_d16 on gfx9/gfx10,
; global_load_d16_u8 on gfx11) into v1, followed by a copy of v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_u8 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
%zext.load = zext i8 %load to i16
%build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i8 load from %sbase + zext(%voffset) - 128 bytes, zero-extended to i16 and
; inserted into element 0 of the incoming %reg vector.
; The checks expect a single unsigned-byte d16 load into v1 with offset:-128,
; followed by a copy of v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i8, ptr addrspace(1) %gep1
%zext.load = zext i8 %load to i16
%build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i8 load from %sbase + zext(%voffset), sign-extended to i16 and inserted
; into element 0 of the incoming %reg vector.
; The checks expect the load, the extend and the insert to become a single
; signed-byte d16 load (global_load_sbyte_d16 on gfx9/gfx10,
; global_load_d16_i8 on gfx11) into v1, followed by a copy of v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_i8 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
%sext.load = sext i8 %load to i16
%build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i8 load from %sbase + zext(%voffset) - 128 bytes, sign-extended to i16 and
; inserted into element 0 of the incoming %reg vector.
; The checks expect a single signed-byte d16 load into v1 with offset:-128,
; followed by a copy of v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i8, ptr addrspace(1) %gep1
%sext.load = sext i8 %load to i16
%build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; --------------------------------------------------------------------------------
; D16 hi load (hi16)
; --------------------------------------------------------------------------------
; i16 load from %sbase + zext(%voffset), inserted into element 1 of an
; otherwise-undef <2 x i16>.
; The checks expect a single "hi" d16 load (global_load_short_d16_hi on
; gfx9/gfx10, global_load_d16_hi_b16 on gfx11) straight into v0.
; NOTE(review): the "_undef_hi" suffix looks carried over from the d16lo
; group; in this test the element left undef is element 0, while element 1
; receives the load.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i16, ptr addrspace(1) %gep0
%build = insertelement <2 x i16> undef, i16 %load, i32 1
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i16 load from %sbase + zext(%voffset) - 128 bytes, inserted into element 1
; of an otherwise-undef <2 x i16> (element 0 is the one left undef).
; The checks expect a single "hi" d16 load into v0 with offset:-128.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i16, ptr addrspace(1) %gep1
%build = insertelement <2 x i16> undef, i16 %load, i32 1
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i16 load from %sbase + zext(%voffset), inserted into element 1 of a
; zeroinitializer <2 x i16>, so element 0 of the result must be 0.
; The checks expect v1 to be set to 0 first, the "hi" d16 load to target v1,
; and the result to be copied into v0 afterwards.
; NOTE(review): despite the "_zero_hi" suffix, the zero half here is
; element 0; the name appears to mirror the d16lo group.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i16, ptr addrspace(1) %gep0
%build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i16 load from %sbase + zext(%voffset) - 128 bytes, inserted into element 1
; of a zeroinitializer <2 x i16>, so element 0 of the result must be 0.
; The checks expect v1 = 0, then a "hi" d16 load into v1 with offset:-128,
; then a copy of v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i16, ptr addrspace(1) %gep1
%build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i16 load from %sbase + zext(%voffset), inserted into element 1 of the
; incoming %reg vector; element 0 of %reg is kept.
; The checks expect the "hi" d16 load to write v1 directly and the result to
; be copied into v0 afterwards, with no separate merge instruction.
; NOTE(review): v1 is presumably the register in which %reg arrives - confirm.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i16, ptr addrspace(1) %gep0
%build = insertelement <2 x i16> %reg, i16 %load, i32 1
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i16 load from %sbase + zext(%voffset) - 128 bytes, inserted into element 1
; of the incoming %reg vector; element 0 of %reg is kept.
; The checks expect a "hi" d16 load into v1 with offset:-128, followed by a
; copy of v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i16, ptr addrspace(1) %gep1
%build = insertelement <2 x i16> %reg, i16 %load, i32 1
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i8 load from %sbase + zext(%voffset), zero-extended to i16 and inserted
; into element 1 of the incoming %reg vector.
; The checks expect the load, the extend and the insert to become a single
; unsigned-byte "hi" d16 load (global_load_ubyte_d16_hi on gfx9/gfx10,
; global_load_d16_hi_u8 on gfx11) into v1, followed by a copy of v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
%zext.load = zext i8 %load to i16
%build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i8 load from %sbase + zext(%voffset) - 128 bytes, zero-extended to i16 and
; inserted into element 1 of the incoming %reg vector.
; The checks expect a single unsigned-byte "hi" d16 load into v1 with
; offset:-128, followed by a copy of v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i8, ptr addrspace(1) %gep1
%zext.load = zext i8 %load to i16
%build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i8 load from %sbase + zext(%voffset), sign-extended to i16 and inserted
; into element 1 of the incoming %reg vector.
; The checks expect the load, the extend and the insert to become a single
; signed-byte "hi" d16 load (global_load_sbyte_d16_hi on gfx9/gfx10,
; global_load_d16_hi_i8 on gfx11) into v1, followed by a copy of v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i8, ptr addrspace(1) %gep0
%sext.load = sext i8 %load to i16
%build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; i8 load from %sbase + zext(%voffset) - 128 bytes, sign-extended to i16 and
; inserted into element 1 of the incoming %reg vector.
; The checks expect a single signed-byte "hi" d16 load into v1 with
; offset:-128, followed by a copy of v1 into v0.
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(ptr addrspace(1) inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
; Second GEP applies the constant byte offset on top of the variable one.
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
%load = load i8, ptr addrspace(1) %gep1
%sext.load = sext i8 %load to i16
%build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
%cast = bitcast <2 x i16> %build to <2 x half>
ret <2 x half> %cast
}
; --------------------------------------------------------------------------------
; or-with-constant as add
; --------------------------------------------------------------------------------
; Check add-as-or with split 64-bit or.
; The address is built purely from integers: zext(%idx) OR 16, converted to a
; global pointer with inttoptr. %sbase is declared but never referenced in the
; body, so there is no SGPR base to fold.
; The checks expect the OR to be done on the low dword only (v_or_b32 with
; 16), the high dword to be set to 0, and the byte load to use the full VGPR
; address v[0:1] with "off" in place of an SGPR base.
define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrspace(6) inreg %sbase, i32 %idx) {
; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
; GCN: ; %bb.0:
; GCN-NEXT: v_or_b32_e32 v0, 16, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: global_load_ubyte v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_or_b32_e32 v0, 16, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 16
%addr = inttoptr i64 %or to ptr addrspace(1)
%load = load i8, ptr addrspace(1) %addr
%zext = zext i8 %load to i32
; Reinterpret as float so the value is returned in a VGPR.
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; Same shape as the imm_offset_16 test, with the larger constant 4160
; (0x1040): the address is zext(%idx) OR 4160 converted with inttoptr, and
; %sbase is declared but never referenced in the body.
; The checks expect v_or_b32 with the literal 0x1040 on the low dword, the
; high dword set to 0, and the byte load addressed by v[0:1] with "off" and
; no immediate offset.
define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) {
; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
; GCN: ; %bb.0:
; GCN-NEXT: v_or_b32_e32 v0, 0x1040, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: global_load_ubyte v0, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_or_b32_e32 v0, 0x1040, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
%zext.idx = zext i32 %idx to i64
%or = or i64 %zext.idx, 4160
%addr = inttoptr i64 %or to ptr addrspace(1)
%load = load i8, ptr addrspace(1) %addr
%zext = zext i8 %load to i32
; Reinterpret as float so the value is returned in a VGPR.
%to.vgpr = bitcast i32 %zext to float
ret float %to.vgpr
}
; --------------------------------------------------------------------------------
; Full 64-bit scalar add.
; --------------------------------------------------------------------------------
; Loop that performs one volatile float load from %arg[i] for i = 0..255.
; In the expected output the induction variable has become a 64-bit byte
; offset held in s[0:1] that advances by 4 per iteration. Each iteration adds
; it to the base in s[2:3] with an s_add_u32/s_addc_u32 pair, and the load uses
; the SGPR-pair address with a VGPR offset (v0) that is set to zero once,
; before the loop. The loop exits when the low dword of the offset reaches
; 0x400 (256 elements * 4 bytes).
define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) {
; GFX9-LABEL: global_addr_64bit_lsr_iv:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB128_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_add_u32 s4, s2, s0
; GFX9-NEXT: s_addc_u32 s5, s3, s1
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_u32 s0, s0, 4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: s_cbranch_scc0 .LBB128_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_addr_64bit_lsr_iv:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB128_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_add_u32 s4, s2, s0
; GFX10-NEXT: s_addc_u32 s5, s3, s1
; GFX10-NEXT: s_add_u32 s0, s0, 4
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX10-NEXT: s_cbranch_scc0 .LBB128_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_addr_64bit_lsr_iv:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: .LBB128_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s4, s2, s0
; GFX11-NEXT: s_addc_u32 s5, s3, s1
; GFX11-NEXT: s_add_u32 s0, s0, 4
; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_addc_u32 s1, s1, 0
; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX11-NEXT: s_cbranch_scc0 .LBB128_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_endpgm
bb:
br label %bb3
bb2: ; preds = %bb3
ret void
bb3: ; preds = %bb3, %bb
; %i is the 32-bit element index; it is zero-extended before indexing.
%i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
%i4 = zext i32 %i to i64
%i5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %i4
; The loaded value is unused; the load is marked volatile.
%i6 = load volatile float, ptr addrspace(1) %i5, align 4
%i8 = add nuw nsw i32 %i, 1
; 256 iterations of 4-byte elements, matching the 0x400 byte bound in the checks.
%i9 = icmp eq i32 %i8, 256
br i1 %i9, label %bb2, label %bb3
}
; Make sure we only have a single zero vaddr initialization.
; Same 256-iteration loop shape as global_addr_64bit_lsr_iv, but with two
; volatile float loads per iteration. The expected output contains a single
; "v_mov_b32 v0, 0" before the loop, and both loads in the loop body use that
; v0 as their VGPR offset.
; NOTE(review): %i5.1 (the GEP off %arg.1) is computed but never used; the
; second load reads %i5 again, so both checked loads use the same s[4:5]
; address and %arg.1 does not appear in the output. This looks like a
; copy-paste slip. If two distinct bases were intended, the second load should
; use %i5.1 and the assertions must be regenerated - confirm before changing.
define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg %arg, ptr addrspace(1) inreg %arg.1) {
; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB129_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_add_u32 s4, s2, s0
; GFX9-NEXT: s_addc_u32 s5, s3, s1
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_u32 s0, s0, 4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: ; kill: killed $sgpr4 killed $sgpr5
; GFX9-NEXT: s_cbranch_scc0 .LBB129_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: .LBB129_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_add_u32 s4, s2, s0
; GFX10-NEXT: s_addc_u32 s5, s3, s1
; GFX10-NEXT: s_add_u32 s0, s0, 4
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[4:5] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX10-NEXT: ; kill: killed $sgpr4 killed $sgpr5
; GFX10-NEXT: s_cbranch_scc0 .LBB129_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_addr_64bit_lsr_iv_multiload:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: .LBB129_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_u32 s4, s2, s0
; GFX11-NEXT: s_addc_u32 s5, s3, s1
; GFX11-NEXT: s_add_u32 s0, s0, 4
; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_addc_u32 s1, s1, 0
; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX11-NEXT: s_cbranch_scc0 .LBB129_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_endpgm
bb:
br label %bb3
bb2: ; preds = %bb3
ret void
bb3: ; preds = %bb3, %bb
; %i is the 32-bit element index shared by both address computations.
%i = phi i32 [ 0, %bb ], [ %i8, %bb3 ]
%i4 = zext i32 %i to i64
%i5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %i4
; The loaded value is unused; the load is marked volatile.
%i6 = load volatile float, ptr addrspace(1) %i5, align 4
; NOTE(review): %i5.1 has no users; the load below takes %i5, not %i5.1.
%i5.1 = getelementptr inbounds float, ptr addrspace(1) %arg.1, i64 %i4
%i6.1 = load volatile float, ptr addrspace(1) %i5, align 4
%i8 = add nuw nsw i32 %i, 1
; 256 iterations of 4-byte elements, matching the 0x400 byte bound in the checks.
%i9 = icmp eq i32 %i8, 256
br i1 %i9, label %bb2, label %bb3
}
; Metadata nodes holding a pair of i32 constants each. They are not referenced
; by the two loop tests directly above.
; NOTE(review): presumably these are !range bounds used by tests elsewhere in
; this file - confirm at the use sites.
!0 = !{i32 0, i32 1073741824} ; (1 << 30)
!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1