[AMDGPU][True16][CodeGen] support for true16 for vinterp 16bit instructions (#116702)

Add codegen support for the 16-bit VINTERP instructions in True16 format.

Currently only two tests are enabled; more will be enabled as more True16
instructions are supported.
This commit is contained in:
Brox Chen
2024-12-09 11:52:05 -05:00
committed by GitHub
parent 4c4606a743
commit 85142f5b35
3 changed files with 358 additions and 224 deletions

View File

@@ -181,9 +181,43 @@ multiclass VInterpF16Pat <SDPatternOperator op, Instruction inst,
def : VInterpF16Pat<op, inst, dst_type, 1, high_pat>;
}
// Selection pattern (GCNPat) for the `_t16` variants of the f16 VINTERP
// instructions.
//
// Source side: `op` applied to three f32 operands, each matched through
// VINTERPMods (which also binds the per-operand modifier value), followed by
// an i1 selector.
// Result side: `inst` fed with the bound modifiers and registers.
//
//   dstVT - result type of the matched node.
//   high  - when set, the pattern matches an i1 selector of -1 and reads the
//           hi16 subregister of the narrowed sources; otherwise it matches
//           an i1 selector of 0 and reads lo16.
//   isP2  - when set, src2 is passed through as a full 32-bit VGPR; otherwise
//           src2 is narrowed to the same 16-bit half as src0.
class VInterpF16Pat_t16 <SDPatternOperator op, Instruction inst,
                         ValueType dstVT, bit high, bit isP2> : GCNPat <
  // Matched node.
  (dstVT (op
    (VINTERPMods f32:$src0, i32:$src0_modifiers),
    (VINTERPMods f32:$src1, i32:$src1_modifiers),
    (VINTERPMods f32:$src2, i32:$src2_modifiers),
    !if(high, (i1 -1), (i1 0)))),
  // Emitted instruction.
  (inst $src0_modifiers,
        // src0 is always read as one 16-bit half of its 32-bit VGPR.
        (f16 (EXTRACT_SUBREG VGPR_32:$src0, !if(high, hi16, lo16))),
        // src1 stays a full 32-bit VGPR.
        $src1_modifiers, VGPR_32:$src1,
        $src2_modifiers,
        // src2 is 32-bit for the P2 form and a 16-bit half otherwise.
        !if(isP2, (f32 VGPR_32:$src2),
                  (f16 (EXTRACT_SUBREG VGPR_32:$src2, !if(high, hi16, lo16)))),
        0,  /* clamp */
        7)  /* wait_exp */
>;
// Instantiates the VInterpF16Pat_t16 class twice for one (op, inst) pair:
// first with high = 0, then with high = 1. dstVT and isP2 are forwarded
// unchanged to both patterns.
multiclass VInterpF16Pat_t16 <SDPatternOperator op, Instruction inst,
                              ValueType dstVT, bit isP2> {
  foreach useHigh = [0, 1] in
    def : VInterpF16Pat_t16<op, inst, dstVT, useHigh, isP2>;
}
// f32 interpolation: instantiate VInterpF32Pat once for the p10 inreg
// intrinsic and once for the p2 inreg intrinsic, each paired with its
// V_INTERP_*_F32_inreg instruction.
def : VInterpF32Pat<int_amdgcn_interp_inreg_p10, V_INTERP_P10_F32_inreg>;
def : VInterpF32Pat<int_amdgcn_interp_inreg_p2, V_INTERP_P2_F32_inreg>;
// f16 interpolation patterns defined with True16Predicate set to
// UseRealTrue16Insts. Each defm goes through the VInterpF16Pat_t16
// multiclass, which yields one pattern for high = 0 and one for high = 1.
// The two trailing arguments are dstVT and isP2: the p10 intrinsics use
// (f32, 0) and the p2 intrinsics use (f16, 1).
let True16Predicate = UseRealTrue16Insts in {
// Default-rounding p10 / p2 forms.
defm : VInterpF16Pat_t16<int_amdgcn_interp_inreg_p10_f16,
V_INTERP_P10_F16_F32_inreg_t16, f32, 0>;
defm : VInterpF16Pat_t16<int_amdgcn_interp_inreg_p2_f16,
V_INTERP_P2_F16_F32_inreg_t16, f16, 1>;
// RTZ p10 / p2 forms.
defm : VInterpF16Pat_t16<int_amdgcn_interp_p10_rtz_f16,
V_INTERP_P10_RTZ_F16_F32_inreg_t16, f32, 0>;
defm : VInterpF16Pat_t16<int_amdgcn_interp_p2_rtz_f16,
V_INTERP_P2_RTZ_F16_F32_inreg_t16, f16, 1>;
}
let True16Predicate = UseFakeTrue16Insts in {
defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16,
V_INTERP_P10_F16_F32_inreg_fake16, f32,

View File

@@ -1,25 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
; GCN-LABEL: v_interp_f32:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 s3, exec_lo
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: lds_param_load v0, attr0.y wait_vdst:15
; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done
; GCN-NEXT: s_endpgm
; GFX11-LABEL: v_interp_f32:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s3, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: s_mov_b32 m0, s2
; GFX11-NEXT: lds_param_load v0, attr0.y wait_vdst:15
; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: v_mov_b32_e32 v4, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
; GFX11-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
; GFX11-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done
; GFX11-NEXT: s_endpgm
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
%p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -32,30 +33,30 @@ main_body:
}
define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
; GCN-LABEL: v_interp_f32_many:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 s3, exec_lo
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: lds_param_load v0, attr0.x wait_vdst:15
; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15
; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15
; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done
; GCN-NEXT: s_endpgm
; GFX11-LABEL: v_interp_f32_many:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s3, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: s_mov_b32 m0, s2
; GFX11-NEXT: lds_param_load v0, attr0.x wait_vdst:15
; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
; GFX11-NEXT: lds_param_load v2, attr2.x wait_vdst:15
; GFX11-NEXT: lds_param_load v3, attr3.x wait_vdst:15
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
; GFX11-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
; GFX11-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
; GFX11-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
; GFX11-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done
; GFX11-NEXT: s_endpgm
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -74,30 +75,30 @@ main_body:
}
define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0) #0 {
; GCN-LABEL: v_interp_f32_many_vm:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_mov_b32 s0, exec_lo
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
; GCN-NEXT: lds_param_load v2, attr0.x wait_vdst:15
; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15
; GCN-NEXT: lds_param_load v4, attr2.x wait_vdst:15
; GCN-NEXT: lds_param_load v5, attr3.x wait_vdst:15
; GCN-NEXT: s_mov_b32 exec_lo, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
; GCN-NEXT: s_endpgm
; GFX11-LABEL: v_interp_f32_many_vm:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
; GFX11-NEXT: s_mov_b32 m0, s0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: lds_param_load v2, attr0.x wait_vdst:15
; GFX11-NEXT: lds_param_load v3, attr1.x wait_vdst:15
; GFX11-NEXT: lds_param_load v4, attr2.x wait_vdst:15
; GFX11-NEXT: lds_param_load v5, attr3.x wait_vdst:15
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
; GFX11-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
; GFX11-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
; GFX11-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done
; GFX11-NEXT: s_endpgm
main_body:
%i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1
%i = load float, ptr addrspace(1) %i.ptr, align 4
@@ -120,23 +121,41 @@ main_body:
}
define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
; GCN-LABEL: v_interp_f16:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 s3, exec_lo
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
; GCN-NEXT: ; return to shader part epilog
; GFX11-TRUE16-LABEL: v_interp_f16:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: v_interp_f16:
; GFX11-FAKE16: ; %bb.0: ; %main_body
; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
@@ -148,23 +167,41 @@ main_body:
}
define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
; GCN-LABEL: v_interp_rtz_f16:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 s3, exec_lo
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
; GCN-NEXT: ; return to shader part epilog
; GFX11-TRUE16-LABEL: v_interp_rtz_f16:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: v_interp_rtz_f16:
; GFX11-FAKE16: ; %bb.0: ; %main_body
; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
@@ -176,17 +213,30 @@ main_body:
}
define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
; GCN-LABEL: v_interp_f16_imm_params:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_add_f16_e32 v0, v1, v0
; GCN-NEXT: ; return to shader part epilog
; GFX11-TRUE16-LABEL: v_interp_f16_imm_params:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: v_interp_f16_imm_params:
; GFX11-FAKE16: ; %bb.0: ; %main_body
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0)
%l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0)

View File

@@ -1,25 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
; GCN-LABEL: v_interp_f32:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 s3, exec_lo
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: lds_param_load v0, attr0.y wait_vdst:15
; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done
; GCN-NEXT: s_endpgm
; GFX11-LABEL: v_interp_f32:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s3, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: s_mov_b32 m0, s2
; GFX11-NEXT: lds_param_load v0, attr0.y wait_vdst:15
; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: v_mov_b32_e32 v4, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
; GFX11-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
; GFX11-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done
; GFX11-NEXT: s_endpgm
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
%p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -32,30 +33,30 @@ main_body:
}
define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
; GCN-LABEL: v_interp_f32_many:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 s3, exec_lo
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: lds_param_load v0, attr0.x wait_vdst:15
; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15
; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15
; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done
; GCN-NEXT: s_endpgm
; GFX11-LABEL: v_interp_f32_many:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: s_mov_b32 s3, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: s_mov_b32 m0, s2
; GFX11-NEXT: lds_param_load v0, attr0.x wait_vdst:15
; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
; GFX11-NEXT: lds_param_load v2, attr2.x wait_vdst:15
; GFX11-NEXT: lds_param_load v3, attr3.x wait_vdst:15
; GFX11-NEXT: s_mov_b32 exec_lo, s3
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
; GFX11-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
; GFX11-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
; GFX11-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
; GFX11-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done
; GFX11-NEXT: s_endpgm
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -74,30 +75,30 @@ main_body:
}
define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0) #0 {
; GCN-LABEL: v_interp_f32_many_vm:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
; GCN-NEXT: s_mov_b32 m0, s0
; GCN-NEXT: s_mov_b32 s0, exec_lo
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
; GCN-NEXT: lds_param_load v2, attr0.x wait_vdst:15
; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15
; GCN-NEXT: lds_param_load v4, attr2.x wait_vdst:15
; GCN-NEXT: lds_param_load v5, attr3.x wait_vdst:15
; GCN-NEXT: s_mov_b32 exec_lo, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
; GCN-NEXT: s_endpgm
; GFX11-LABEL: v_interp_f32_many_vm:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
; GFX11-NEXT: s_mov_b32 m0, s0
; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: lds_param_load v2, attr0.x wait_vdst:15
; GFX11-NEXT: lds_param_load v3, attr1.x wait_vdst:15
; GFX11-NEXT: lds_param_load v4, attr2.x wait_vdst:15
; GFX11-NEXT: lds_param_load v5, attr3.x wait_vdst:15
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
; GFX11-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
; GFX11-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
; GFX11-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done
; GFX11-NEXT: s_endpgm
main_body:
%i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1
%i = load float, ptr addrspace(1) %i.ptr, align 4
@@ -120,23 +121,41 @@ main_body:
}
define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
; GCN-LABEL: v_interp_f16:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 s3, exec_lo
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
; GCN-NEXT: ; return to shader part epilog
; GFX11-TRUE16-LABEL: v_interp_f16:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: v_interp_f16:
; GFX11-FAKE16: ; %bb.0: ; %main_body
; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
@@ -148,23 +167,41 @@ main_body:
}
define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
; GCN-LABEL: v_interp_rtz_f16:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: s_mov_b32 s3, exec_lo
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GCN-NEXT: s_mov_b32 exec_lo, s3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
; GCN-NEXT: ; return to shader part epilog
; GFX11-TRUE16-LABEL: v_interp_rtz_f16:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: v_interp_rtz_f16:
; GFX11-FAKE16: ; %bb.0: ; %main_body
; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
%l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
@@ -176,17 +213,30 @@ main_body:
}
define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
; GCN-LABEL: v_interp_f16_imm_params:
; GCN: ; %bb.0: ; %main_body
; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_add_f16_e32 v0, v1, v0
; GCN-NEXT: ; return to shader part epilog
; GFX11-TRUE16-LABEL: v_interp_f16_imm_params:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: v_interp_f16_imm_params:
; GFX11-FAKE16: ; %bb.0: ; %main_body
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
main_body:
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0)
%l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0)