This is an NFC patch. It runs a bulk update on the CodeGen tests that are impacted by the true16 features. This patch: 1. duplicates the GFX11plus run lines and applies them with "+mattr=+real-true16" and "+mattr=-real-true16"; 2. updates the tests with the update script. For some GISEL run lines, the current CodeGen does not fully support the true16 version. The run lines are still updated, but the failing ones are commented out, and a "FIXME-TRUE16" comment is added to those tests for easier tracking. These tests will be fixed in the following patches. This is a transition state in which we support both "+real-true16" and "-real-true16" in our code base. We plan to move to "+real-true16" as the default, and finally remove the "-real-true16" mode and its test lines.
226 lines
11 KiB
LLVM
226 lines
11 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
|
|
; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
|
|
; RUN: llc -mcpu=gfx810 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
|
|
; RUN: llc -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
|
|
; RUN: llc -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
|
|
; RUN: llc -mcpu=gfx1200 -mattr=+real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
|
|
; RUN: llc -mcpu=gfx1200 -mattr=-real-true16 -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
|
|
|
|
; External, zero-length i32 array in addrspace(3), aligned to 65536 bytes.
; @main stores two i32 values into it at indices 7 and 8.
@esgs_ring = external addrspace(3) global [0 x i32], align 65536
|
|
|
|
; @main (amdgpu_gs): calls llvm.amdgcn.struct.buffer.load.format.v3f16 with %arg and
; %arg1 as its first two operands, zero-extends elements 1 and 2 of the <3 x half>
; result to i32, and stores them to @esgs_ring at indices 7 and 8.
; The autogenerated assertions below expect, for every run line, a loop of
; v_readfirstlane_b32 on v0-v3 around the buffer load, followed by one paired 32-bit
; DS store with offset0:7 offset1:8.
; With +real-true16, element 1 is expected to be extracted with v_mov_b16 (v5.h into
; v0.l, v0.h set to 0); the other run lines expect v_lshrrev_b32 by 16 instead.
define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) {
|
|
; GFX10-LABEL: main:
|
|
; GFX10: ; %bb.0: ; %bb
|
|
; GFX10-NEXT: s_mov_b32 s1, exec_lo
|
|
; GFX10-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
|
|
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GFX10-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GFX10-NEXT: v_readfirstlane_b32 s6, v2
|
|
; GFX10-NEXT: v_readfirstlane_b32 s7, v3
|
|
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
|
|
; GFX10-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
|
|
; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0
|
|
; GFX10-NEXT: s_and_saveexec_b32 s0, s0
|
|
; GFX10-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen
|
|
; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
|
|
; GFX10-NEXT: ; implicit-def: $vgpr4
|
|
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
|
|
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0
|
|
; GFX10-NEXT: s_cbranch_execnz .LBB0_1
|
|
; GFX10-NEXT: ; %bb.2:
|
|
; GFX10-NEXT: s_mov_b32 exec_lo, s1
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5
|
|
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v6
|
|
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
|
; GFX10-NEXT: ds_write2_b32 v2, v0, v1 offset0:7 offset1:8
|
|
;
|
|
; GFX9-LABEL: main:
|
|
; GFX9: ; %bb.0: ; %bb
|
|
; GFX9-NEXT: s_mov_b64 s[2:3], exec
|
|
; GFX9-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GFX9-NEXT: v_readfirstlane_b32 s6, v2
|
|
; GFX9-NEXT: v_readfirstlane_b32 s7, v3
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
|
|
; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
|
|
; GFX9-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
|
|
; GFX9-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX9-NEXT: s_nop 0
|
|
; GFX9-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen
|
|
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
|
|
; GFX9-NEXT: ; implicit-def: $vgpr4
|
|
; GFX9-NEXT: s_xor_b64 exec, exec, s[0:1]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB0_1
|
|
; GFX9-NEXT: ; %bb.2:
|
|
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
|
|
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v6
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX9-NEXT: ds_write2_b32 v2, v0, v1 offset0:7 offset1:8
|
|
;
|
|
; GFX8-LABEL: main:
|
|
; GFX8: ; %bb.0: ; %bb
|
|
; GFX8-NEXT: s_mov_b64 s[2:3], exec
|
|
; GFX8-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GFX8-NEXT: v_readfirstlane_b32 s6, v2
|
|
; GFX8-NEXT: v_readfirstlane_b32 s7, v3
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
|
|
; GFX8-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
|
|
; GFX8-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
|
|
; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
|
|
; GFX8-NEXT: s_nop 0
|
|
; GFX8-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen
|
|
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
|
|
; GFX8-NEXT: ; implicit-def: $vgpr4
|
|
; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB0_1
|
|
; GFX8-NEXT: ; %bb.2:
|
|
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v5
|
|
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v6
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX8-NEXT: s_mov_b32 m0, -1
|
|
; GFX8-NEXT: ds_write2_b32 v2, v0, v1 offset0:7 offset1:8
|
|
;
|
|
; GFX11-TRUE16-LABEL: main:
|
|
; GFX11-TRUE16: ; %bb.0: ; %bb
|
|
; GFX11-TRUE16-NEXT: s_mov_b32 s1, exec_lo
|
|
; GFX11-TRUE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
|
|
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
|
|
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
|
|
; GFX11-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
|
|
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
|
|
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
|
|
; GFX11-TRUE16-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
|
|
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
|
|
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4
|
|
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
|
|
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB0_1
|
|
; GFX11-TRUE16-NEXT: ; %bb.2:
|
|
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
|
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
|
|
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
|
|
; GFX11-TRUE16-NEXT: ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
|
|
;
|
|
; GFX11-FAKE16-LABEL: main:
|
|
; GFX11-FAKE16: ; %bb.0: ; %bb
|
|
; GFX11-FAKE16-NEXT: s_mov_b32 s1, exec_lo
|
|
; GFX11-FAKE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
|
|
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
|
|
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
|
|
; GFX11-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
|
|
; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
|
|
; GFX11-FAKE16-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], 0 idxen
|
|
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
|
|
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
|
|
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
|
|
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB0_1
|
|
; GFX11-FAKE16-NEXT: ; %bb.2:
|
|
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s1
|
|
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v5
|
|
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
|
|
; GFX11-FAKE16-NEXT: ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
|
|
;
|
|
; GFX12-TRUE16-LABEL: main:
|
|
; GFX12-TRUE16: ; %bb.0: ; %bb
|
|
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
|
|
; GFX12-TRUE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
|
|
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2
|
|
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3
|
|
; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
|
|
; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
|
|
; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
|
|
; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
|
|
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-TRUE16-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen
|
|
; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
|
|
; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4
|
|
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
|
|
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1
|
|
; GFX12-TRUE16-NEXT: ; %bb.2:
|
|
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
|
|
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
|
|
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
|
|
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
|
|
; GFX12-TRUE16-NEXT: ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
|
|
;
|
|
; GFX12-FAKE16-LABEL: main:
|
|
; GFX12-FAKE16: ; %bb.0: ; %bb
|
|
; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
|
|
; GFX12-FAKE16-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
|
|
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2
|
|
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3
|
|
; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
|
|
; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
|
|
; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
|
|
; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
|
|
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-FAKE16-NEXT: buffer_load_d16_format_xyz v[5:6], v4, s[4:7], null idxen
|
|
; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
|
|
; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr4
|
|
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
|
|
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB0_1
|
|
; GFX12-FAKE16-NEXT: ; %bb.2:
|
|
; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s1
|
|
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v5
|
|
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xffff, v6
|
|
; GFX12-FAKE16-NEXT: ds_store_2addr_b32 v2, v0, v1 offset0:7 offset1:8
|
|
bb:
|
|
; %i is not referenced by any later instruction in this function.
%i = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 poison)
|
|
; Load three halves; the last three integer operands of the call are all 0.
%i2 = call nsz arcp <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32> %arg, i32 %arg1, i32 0, i32 0, i32 0)
|
|
; Reinterpret the loaded halves as i16 and take element 1.
%i3 = bitcast <3 x half> %i2 to <3 x i16>
|
|
%i4 = extractelement <3 x i16> %i3, i32 1
|
|
; Same bitcast of %i2 again (duplicates %i3); used here to take element 2.
%i5 = bitcast <3 x half> %i2 to <3 x i16>
|
|
%i6 = extractelement <3 x i16> %i5, i32 2
|
|
; Zero-extend both 16-bit elements to i32 before storing.
%i7 = zext i16 %i4 to i32
|
|
%i8 = zext i16 %i6 to i32
|
|
; Index 7, written as an add of the constants 0 and 7.
%i9 = add nuw nsw i32 0, 7
|
|
%i10 = getelementptr [0 x i32], ptr addrspace(3) @esgs_ring, i32 0, i32 %i9
|
|
; Store element 1 at @esgs_ring[7].
store i32 %i7, ptr addrspace(3) %i10, align 4
|
|
; Index 8, written as an add of the constants 0 and 8.
%i11 = add nuw nsw i32 0, 8
|
|
%i12 = getelementptr [0 x i32], ptr addrspace(3) @esgs_ring, i32 0, i32 %i11
|
|
; Store element 2 at @esgs_ring[8].
store i32 %i8, ptr addrspace(3) %i12, align 4
|
|
; The block ends in unreachable instead of a ret.
unreachable
|
|
}
|
|
; Intrinsic called once in @main; its result is unused there. Uses attribute group #0.
; Function Attrs: nounwind readnone willreturn
|
|
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
|
|
; Intrinsic that produces the <3 x half> value consumed by @main. The final i32
; parameter is marked immarg. Uses attribute group #1.
; Function Attrs: nounwind readonly willreturn
|
|
declare <3 x half> @llvm.amdgcn.struct.buffer.load.format.v3f16(<4 x i32>, i32, i32, i32, i32 immarg) #1
|
|
; Attribute groups referenced by the two declarations above.
attributes #0 = { nounwind readnone willreturn }
|
|
attributes #1 = { nounwind readonly willreturn }
|
|
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
|
; GFX11: {{.*}}
|
|
; GFX12: {{.*}}
|