; The allocation order of 16-bit registers is vgpr0lo16, vgpr0hi16, vgpr1lo16,
; vgpr1hi16, vgpr2lo16, ... We prefer (essentially require) that allocation
; order because it uses the minimum number of registers. But when 16-bit data
; passes between 16-bit and 32-bit instructions, many COPYs are generated.
; This patch teaches the compiler that a COPY of a 16-bit value from a 32-bit
; register to a lo-half 16-bit register is free, while a COPY to a hi-half
; 16-bit register is not. This might later be improved by coalescing
; additional cases, and perhaps as an alternative to the RA hints. For now,
; this solution is being upstreamed first.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefixes=GFX11,GFX11-FAKE16 %s

; FIXME: GFX9 should be producing v_mad_u16 instead of v_mad_legacy_u16.
|
define amdgpu_kernel void @mad_u16(
; GFX8-LABEL: mad_u16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; GFX8-NEXT: flat_load_ushort v6, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_ushort v2, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_ushort v3, v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mad_u16 v2, v6, v2, v3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: mad_u16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v0, s[12:13] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_ushort v3, v0, s[14:15] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
; GFX9-NEXT: global_store_short v0, v1, s[8:9]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: mad_u16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[10:11] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_ushort v2, v0, s[12:13] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_ushort v3, v0, s[14:15] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mad_u16 v1, v1, v2, v3
; GFX10-NEXT: global_store_short v0, v1, s[8:9]
; GFX10-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: mad_u16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v1, s[6:7] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v1.l
; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: mad_u16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[6:7] glc dlc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_mad_u16 v0, v1, v2, v0
; GFX11-FAKE16-NEXT: global_store_b16 v3, v0, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b,
    ptr addrspace(1) %c) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a, i32 %tid
  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b, i32 %tid
  %c.gep = getelementptr inbounds i16, ptr addrspace(1) %c, i32 %tid

  %a.val = load volatile i16, ptr addrspace(1) %a.gep
  %b.val = load volatile i16, ptr addrspace(1) %b.gep
  %c.val = load volatile i16, ptr addrspace(1) %c.gep

  %m.val = mul i16 %a.val, %b.val
  %r.val = add i16 %m.val, %c.val

  store i16 %r.val, ptr addrspace(1) %r
  ret void
}
define i16 @v_mad_u16(i16 %arg0, i16 %arg1, i16 %arg2) {
; GFX8-LABEL: v_mad_u16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mad_u16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mad_u16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_mad_u16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_mad_u16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %mul = mul i16 %arg0, %arg1
  %add = add i16 %mul, %arg2
  ret i16 %add
}
define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) {
; GFX8-LABEL: v_mad_u16_zext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mad_u16_zext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mad_u16_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_mad_u16_zext:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_mad_u16_zext:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %mul = mul i16 %arg0, %arg1
  %add = add i16 %mul, %arg2
  %zext = zext i16 %add to i32
  ret i32 %zext
}
define i64 @v_mad_u16_zext64(i16 %arg0, i16 %arg1, i16 %arg2) {
; GFX8-LABEL: v_mad_u16_zext64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mad_u16_zext64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mad_legacy_u16 v0, v0, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mad_u16_zext64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_mad_u16_zext64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_mad_u16_zext64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %mul = mul i16 %arg0, %arg1
  %add = add i16 %mul, %arg2
  %zext = zext i16 %add to i64
  ret i64 %zext
}
define amdgpu_ps i16 @s_mad_u16(i16 inreg %arg0, i16 inreg %arg1, i16 inreg %arg2) {
; GFX8-LABEL: s_mad_u16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s0, s0, s2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mad_u16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_add_i32 s0, s0, s2
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mad_u16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_add_i32 s0, s0, s2
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_mad_u16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mul_i32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s0, s0, s2
; GFX11-NEXT: ; return to shader part epilog
  %mul = mul i16 %arg0, %arg1
  %add = add i16 %mul, %arg2
  ret i16 %add
}
define amdgpu_ps i32 @s_mad_u16_zext(i16 inreg %arg0, i16 inreg %arg1, i16 inreg %arg2) {
; GFX8-LABEL: s_mad_u16_zext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mad_u16_zext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_add_i32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mad_u16_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_add_i32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_mad_u16_zext:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mul_i32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s0, s0, s2
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: ; return to shader part epilog
  %mul = mul i16 %arg0, %arg1
  %add = add i16 %mul, %arg2
  %zext = zext i16 %add to i32
  ret i32 %zext
}
define amdgpu_ps i64 @s_mad_u16_zext64(i16 inreg %arg0, i16 inreg %arg1, i16 inreg %arg2) {
; GFX8-LABEL: s_mad_u16_zext64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_mov_b32 s1, 0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mad_u16_zext64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_add_i32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mad_u16_zext64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_mov_b32 s1, 0
; GFX10-NEXT: s_add_i32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_mad_u16_zext64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mul_i32 s0, s0, s1
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_add_i32 s0, s0, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: ; return to shader part epilog
  %mul = mul i16 %arg0, %arg1
  %add = add i16 %mul, %arg2
  %zext = zext i16 %add to i64
  ret i64 %zext
}
define amdgpu_ps i32 @s_mad_u16_sext(i16 inreg %arg0, i16 inreg %arg1, i16 inreg %arg2) {
; GFX8-LABEL: s_mad_u16_sext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s0, s0, s2
; GFX8-NEXT: s_sext_i32_i16 s0, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mad_u16_sext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_add_i32 s0, s0, s2
; GFX9-NEXT: s_sext_i32_i16 s0, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mad_u16_sext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_add_i32 s0, s0, s2
; GFX10-NEXT: s_sext_i32_i16 s0, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_mad_u16_sext:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_mul_i32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s0, s0, s2
; GFX11-NEXT: s_sext_i32_i16 s0, s0
; GFX11-NEXT: ; return to shader part epilog
  %mul = mul i16 %arg0, %arg1
  %add = add i16 %mul, %arg2
  %sext = sext i16 %add to i32
  ret i32 %sext
}
declare i32 @llvm.amdgcn.workitem.id.x()
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}