Files
clang-p2996/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
Austin Kerbow ba0d079c7a [AMDGPU] Aggressively schedule to reduce RP in occupancy limited regions
By not clustering loads and by adjusting heuristics to reduce register
pressure more aggressively, we may be able to increase occupancy for the
function if it was dropped during the first scheduling pass.

Similarly, try to reduce spilling if register usage exceeds lower bound
occupancy.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D130329
2022-07-27 22:34:37 -07:00

1997 lines
81 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; i16 multiply with both operands passed `inreg` to an amdgpu_ps function.
; Per the expectations below, gfx7 emits a lone s_mul_i32, while gfx8 and
; newer targets first mask each operand with 0xffff.
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX7-LABEL: s_mul_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
  %product = mul i16 %num, %den
  ret i16 %product
}
; i16 multiply in an ordinary (default calling convention) function.
; Expected selection: gfx7 masks both inputs to 16 bits and multiplies with
; v_mul_u32_u24; gfx8/gfx9 use v_mul_lo_u16_e32; gfx10 and gfx11 use
; v_mul_lo_u16.
define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX7-LABEL: v_mul_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %product = mul i16 %num, %den
  ret i16 %product
}
; `inreg` i16 multiply where the arguments and the return value are all
; marked zeroext. Every target is expected to re-mask the s_mul_i32 result
; with 0xffff; gfx7 additionally skips masking the inputs.
define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
; GFX7-LABEL: s_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16_zeroext:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT: ; return to shader part epilog
  %product = mul i16 %num, %den
  ret i16 %product
}
; i16 multiply with zeroext arguments and a zeroext return value in an
; ordinary function. Expected: gfx7 multiplies with v_mul_u32_u24 and then
; masks the result; gfx8/gfx9 emit only v_mul_lo_u16_e32; gfx10 and gfx11
; follow v_mul_lo_u16 with a v_bfe_u32 of the low 16 bits.
define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX7-LABEL: v_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i16_zeroext:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %product = mul i16 %num, %den
  ret i16 %product
}
; `inreg` i16 multiply where the arguments and the return value are all
; marked signext. Every target is expected to finish with s_sext_i32_i16 on
; the s_mul_i32 result; gfx8 and newer also mask both inputs with 0xffff.
define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
; GFX7-LABEL: s_mul_i16_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: s_sext_i32_i16 s0, s0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_sext_i32_i16 s0, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_sext_i32_i16 s0, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16_signext:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
; GFX10PLUS-NEXT: ; return to shader part epilog
  %product = mul i16 %num, %den
  ret i16 %product
}
; i16 multiply with signext arguments and a signext return value in an
; ordinary function. All targets are expected to sign-extend the product
; with v_bfe_i32 (offset 0, width 16); gfx7 masks the inputs and multiplies
; with v_mul_u32_u24, the newer targets use a 16-bit multiply.
define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX7-LABEL: v_mul_i16_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i16_signext:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %product = mul i16 %num, %den
  ret i16 %product
}
; `inreg` i32 multiply in an amdgpu_ps function. Every checked target is
; expected to emit exactly one s_mul_i32.
define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GCN-LABEL: s_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
  %product = mul i32 %num, %den
  ret i32 %product
}
; i32 multiply in an ordinary function. Every checked target is expected to
; emit a single v_mul_lo_u32; gfx10 and gfx11 additionally wait on vscnt at
; function entry.
define i32 @v_mul_i32(i32 %num, i32 %den) {
; GCN-LABEL: v_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %product = mul i32 %num, %den
  ret i32 %product
}
; `inreg` <2 x i32> multiply in an amdgpu_ps function. The expected code is
; one s_mul_i32 per vector element (s0*s2 and s1*s3).
define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
; GCN-LABEL: s_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s2
; GCN-NEXT: s_mul_i32 s1, s1, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_v2i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s3
; GFX10PLUS-NEXT: ; return to shader part epilog
  %product = mul <2 x i32> %num, %den
  ret <2 x i32> %product
}
; <2 x i32> multiply in an ordinary function. The expected code is one
; v_mul_lo_u32 per vector element (v0*v2 and v1*v3).
define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GCN-LABEL: v_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v2
; GCN-NEXT: v_mul_lo_u32 v1, v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_v2i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX10PLUS-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
  %product = mul <2 x i32> %num, %den
  ret <2 x i32> %product
}
; `inreg` i33 multiply (an illegal integer width) in an amdgpu_ps function,
; the same calling convention used by the other scalar tests in this file.
; The expected code is a two-register multiply: the low 32x32 product, with
; its high half and the two cross products summed into the upper register.
; gfx7/gfx8 obtain the high half through v_mul_hi_u32 + v_readfirstlane_b32;
; gfx9 and newer use s_mul_hi_u32.
; NOTE(review): assertions are autogenerated; rerun
; utils/update_llc_test_checks.py to confirm they are unchanged.
define amdgpu_ps i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) {
; GFX7-LABEL: s_mul_i33:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s4, s0, s2
; GFX7-NEXT: s_mul_i32 s0, s0, s3
; GFX7-NEXT: s_mul_i32 s1, s1, s2
; GFX7-NEXT: v_readfirstlane_b32 s5, v0
; GFX7-NEXT: s_add_u32 s0, s0, s5
; GFX7-NEXT: s_add_u32 s1, s1, s0
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i33:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s4, s0, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s3
; GFX8-NEXT: s_mul_i32 s1, s1, s2
; GFX8-NEXT: v_readfirstlane_b32 s5, v0
; GFX8-NEXT: s_add_u32 s0, s0, s5
; GFX8-NEXT: s_add_u32 s1, s1, s0
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i33:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s4, s0, s2
; GFX9-NEXT: s_mul_hi_u32 s5, s0, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s3
; GFX9-NEXT: s_add_u32 s0, s0, s5
; GFX9-NEXT: s_mul_i32 s1, s1, s2
; GFX9-NEXT: s_add_u32 s1, s1, s0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i33:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_hi_u32 s4, s0, s2
; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3
; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2
; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
  %result = mul i33 %num, %den
  ret i33 %result
}
; `inreg` i64 multiply in an amdgpu_ps function. Expected shape: the low
; 32x32 product, with its high half and the two cross products summed into
; the upper register. gfx7/gfx8 obtain the high half through v_mul_hi_u32 +
; v_readfirstlane_b32; gfx9 and newer use s_mul_hi_u32.
define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX7-LABEL: s_mul_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s4, s0, s2
; GFX7-NEXT: s_mul_i32 s0, s0, s3
; GFX7-NEXT: s_mul_i32 s1, s1, s2
; GFX7-NEXT: v_readfirstlane_b32 s5, v0
; GFX7-NEXT: s_add_u32 s0, s0, s5
; GFX7-NEXT: s_add_u32 s1, s1, s0
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s4, s0, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s3
; GFX8-NEXT: s_mul_i32 s1, s1, s2
; GFX8-NEXT: v_readfirstlane_b32 s5, v0
; GFX8-NEXT: s_add_u32 s0, s0, s5
; GFX8-NEXT: s_add_u32 s1, s1, s0
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s4, s0, s2
; GFX9-NEXT: s_mul_hi_u32 s5, s0, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s3
; GFX9-NEXT: s_add_u32 s0, s0, s5
; GFX9-NEXT: s_mul_i32 s1, s1, s2
; GFX9-NEXT: s_add_u32 s1, s1, s0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_hi_u32 s4, s0, s2
; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3
; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2
; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
  %product = mul i64 %num, %den
  ret i64 %product
}
; i64 multiply in an ordinary function. gfx7 through gfx9 are expected to
; chain three v_mad_u64_u32 instructions; gfx10 and gfx11 use a single
; v_mad_u64_u32 for the low product and fold the two cross products
; (v_mul_lo_u32) into the high half with v_add3_u32.
define i64 @v_mul_i64(i64 %num, i64 %den) {
; GCN-LABEL: v_mul_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, v0
; GCN-NEXT: v_mov_b32_e32 v5, v1
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0
; GFX10-NEXT: v_mul_lo_u32 v3, v4, v3
; GFX10-NEXT: v_mul_lo_u32 v2, v5, v2
; GFX10-NEXT: v_add3_u32 v1, v1, v3, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3
; GFX11-NEXT: v_mul_lo_u32 v2, v5, v2
; GFX11-NEXT: v_add3_u32 v1, v1, v3, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %product = mul i64 %num, %den
  ret i64 %product
}
; `inreg` i96 multiply in an amdgpu_ps function; the product is bitcast to
; <3 x i32> for the return. gfx7/gfx8 are expected to compute the high
; halves of the partial products with v_mul_hi_u32 + v_readfirstlane_b32,
; whereas gfx9 and newer use s_mul_hi_u32 directly.
define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX7-LABEL: s_mul_i96:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: v_mul_hi_u32 v1, s0, v1
; GFX7-NEXT: s_mul_i32 s5, s0, s5
; GFX7-NEXT: v_readfirstlane_b32 s7, v0
; GFX7-NEXT: s_mul_i32 s8, s1, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: s_add_u32 s5, s8, s5
; GFX7-NEXT: s_mul_i32 s2, s2, s3
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s3
; GFX7-NEXT: s_mul_i32 s6, s0, s3
; GFX7-NEXT: s_add_u32 s2, s2, s5
; GFX7-NEXT: s_mul_i32 s0, s0, s4
; GFX7-NEXT: v_readfirstlane_b32 s4, v1
; GFX7-NEXT: s_add_u32 s0, s0, s7
; GFX7-NEXT: s_addc_u32 s2, s4, s2
; GFX7-NEXT: s_mul_i32 s1, s1, s3
; GFX7-NEXT: v_readfirstlane_b32 s3, v0
; GFX7-NEXT: s_add_u32 s1, s1, s0
; GFX7-NEXT: s_addc_u32 s2, s3, s2
; GFX7-NEXT: s_mov_b32 s0, s6
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i96:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1
; GFX8-NEXT: s_mul_i32 s5, s0, s5
; GFX8-NEXT: v_readfirstlane_b32 s7, v0
; GFX8-NEXT: s_mul_i32 s8, s1, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: s_add_u32 s5, s8, s5
; GFX8-NEXT: s_mul_i32 s2, s2, s3
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s3
; GFX8-NEXT: s_mul_i32 s6, s0, s3
; GFX8-NEXT: s_add_u32 s2, s2, s5
; GFX8-NEXT: s_mul_i32 s0, s0, s4
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: s_add_u32 s0, s0, s7
; GFX8-NEXT: s_addc_u32 s2, s4, s2
; GFX8-NEXT: s_mul_i32 s1, s1, s3
; GFX8-NEXT: v_readfirstlane_b32 s3, v0
; GFX8-NEXT: s_add_u32 s1, s1, s0
; GFX8-NEXT: s_addc_u32 s2, s3, s2
; GFX8-NEXT: s_mov_b32 s0, s6
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i96:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s5, s0, s5
; GFX9-NEXT: s_mul_i32 s8, s1, s4
; GFX9-NEXT: s_add_u32 s5, s8, s5
; GFX9-NEXT: s_mul_i32 s2, s2, s3
; GFX9-NEXT: s_mul_hi_u32 s7, s0, s3
; GFX9-NEXT: s_add_u32 s2, s2, s5
; GFX9-NEXT: s_mul_i32 s5, s0, s4
; GFX9-NEXT: s_mul_i32 s6, s0, s3
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s4
; GFX9-NEXT: s_add_u32 s4, s5, s7
; GFX9-NEXT: s_addc_u32 s0, s0, s2
; GFX9-NEXT: s_mul_i32 s2, s1, s3
; GFX9-NEXT: s_mul_hi_u32 s3, s1, s3
; GFX9-NEXT: s_add_u32 s1, s2, s4
; GFX9-NEXT: s_addc_u32 s2, s3, s0
; GFX9-NEXT: s_mov_b32 s0, s6
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i96:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s6, s0, s5
; GFX10PLUS-NEXT: s_mul_i32 s7, s1, s4
; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s3
; GFX10PLUS-NEXT: s_add_i32 s6, s6, s7
; GFX10PLUS-NEXT: s_mul_hi_u32 s7, s0, s3
; GFX10PLUS-NEXT: s_add_i32 s6, s6, s2
; GFX10PLUS-NEXT: s_mul_i32 s2, s0, s4
; GFX10PLUS-NEXT: s_mul_i32 s5, s0, s3
; GFX10PLUS-NEXT: s_mul_hi_u32 s0, s0, s4
; GFX10PLUS-NEXT: s_add_u32 s2, s2, s7
; GFX10PLUS-NEXT: s_mul_i32 s4, s1, s3
; GFX10PLUS-NEXT: s_addc_u32 s0, s0, s6
; GFX10PLUS-NEXT: s_mul_hi_u32 s3, s1, s3
; GFX10PLUS-NEXT: s_add_u32 s1, s4, s2
; GFX10PLUS-NEXT: s_addc_u32 s2, s3, s0
; GFX10PLUS-NEXT: s_mov_b32 s0, s5
; GFX10PLUS-NEXT: ; return to shader part epilog
  %product = mul i96 %num, %den
  ; Reinterpret the 96-bit product as three 32-bit lanes for the return.
  %lanes = bitcast i96 %product to <3 x i32>
  ret <3 x i32> %lanes
}
; i96 multiply in an ordinary function, returned directly as i96.
; gfx7 through gfx9 are expected to build the result from five
; v_mad_u64_u32 instructions; gfx10 and gfx11 compute the top-word
; contributions with v_mul_lo_u32 + v_add3_u32 and use three v_mad_u64_u32.
define i96 @v_mul_i96(i96 %num, i96 %den) {
; GCN-LABEL: v_mul_i96:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v6, v0
; GCN-NEXT: v_mov_b32_e32 v7, v1
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1]
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9]
; GCN-NEXT: v_mov_b32_e32 v2, v8
; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2]
; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: v_mov_b32_e32 v7, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX10-NEXT: v_mul_lo_u32 v5, v6, v5
; GFX10-NEXT: v_mul_lo_u32 v8, v7, v4
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0
; GFX10-NEXT: v_add3_u32 v2, v5, v8, v2
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i96:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
; GFX11-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX11-NEXT: v_mul_lo_u32 v5, v6, v5
; GFX11-NEXT: v_mul_lo_u32 v8, v7, v4
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0
; GFX11-NEXT: v_add3_u32 v2, v5, v8, v2
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %product = mul i96 %num, %den
  ret i96 %product
}
; `inreg` i128 multiply in an amdgpu_ps function; the product is bitcast to
; <4 x i32> for the return. The expected code accumulates 32-bit partial
; products with s_add_u32/s_addc_u32 carry chains, saving one carry through
; s_cselect_b32 and restoring it with s_cmp_lg_u32. gfx7/gfx8 obtain high
; halves via v_mul_hi_u32 + v_readfirstlane_b32; gfx9 and newer use
; s_mul_hi_u32.
define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX7-LABEL: s_mul_i128:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mul_hi_u32 v2, s1, v1
; GFX7-NEXT: s_mul_i32 s10, s0, s6
; GFX7-NEXT: v_readfirstlane_b32 s9, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s13, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX7-NEXT: s_mul_i32 s12, s1, s5
; GFX7-NEXT: v_readfirstlane_b32 s11, v0
; GFX7-NEXT: s_add_u32 s10, s12, s10
; GFX7-NEXT: v_mul_hi_u32 v1, s0, v1
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: s_addc_u32 s11, s13, s11
; GFX7-NEXT: s_mul_i32 s12, s2, s4
; GFX7-NEXT: v_readfirstlane_b32 s13, v2
; GFX7-NEXT: s_add_u32 s10, s12, s10
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s4
; GFX7-NEXT: s_addc_u32 s11, s13, s11
; GFX7-NEXT: s_mul_i32 s12, s0, s5
; GFX7-NEXT: v_readfirstlane_b32 s13, v1
; GFX7-NEXT: s_add_u32 s9, s12, s9
; GFX7-NEXT: s_addc_u32 s10, s13, s10
; GFX7-NEXT: s_mul_i32 s13, s1, s4
; GFX7-NEXT: s_cselect_b32 s12, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s14, v0
; GFX7-NEXT: s_add_u32 s9, s13, s9
; GFX7-NEXT: s_mul_i32 s8, s0, s4
; GFX7-NEXT: s_addc_u32 s10, s14, s10
; GFX7-NEXT: s_mul_i32 s0, s0, s7
; GFX7-NEXT: s_addc_u32 s0, s11, s0
; GFX7-NEXT: s_mul_i32 s1, s1, s6
; GFX7-NEXT: s_cmp_lg_u32 s12, 0
; GFX7-NEXT: s_addc_u32 s0, s0, s1
; GFX7-NEXT: s_mul_i32 s2, s2, s5
; GFX7-NEXT: s_add_u32 s0, s2, s0
; GFX7-NEXT: s_mul_i32 s3, s3, s4
; GFX7-NEXT: s_add_u32 s3, s3, s0
; GFX7-NEXT: s_mov_b32 s0, s8
; GFX7-NEXT: s_mov_b32 s1, s9
; GFX7-NEXT: s_mov_b32 s2, s10
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mul_hi_u32 v2, s1, v1
; GFX8-NEXT: s_mul_i32 s10, s0, s6
; GFX8-NEXT: v_readfirstlane_b32 s9, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s13, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX8-NEXT: s_mul_i32 s12, s1, s5
; GFX8-NEXT: v_readfirstlane_b32 s11, v0
; GFX8-NEXT: s_add_u32 s10, s12, s10
; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: s_addc_u32 s11, s13, s11
; GFX8-NEXT: s_mul_i32 s12, s2, s4
; GFX8-NEXT: v_readfirstlane_b32 s13, v2
; GFX8-NEXT: s_add_u32 s10, s12, s10
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s4
; GFX8-NEXT: s_addc_u32 s11, s13, s11
; GFX8-NEXT: s_mul_i32 s12, s0, s5
; GFX8-NEXT: v_readfirstlane_b32 s13, v1
; GFX8-NEXT: s_add_u32 s9, s12, s9
; GFX8-NEXT: s_addc_u32 s10, s13, s10
; GFX8-NEXT: s_mul_i32 s13, s1, s4
; GFX8-NEXT: s_cselect_b32 s12, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s14, v0
; GFX8-NEXT: s_add_u32 s9, s13, s9
; GFX8-NEXT: s_mul_i32 s8, s0, s4
; GFX8-NEXT: s_addc_u32 s10, s14, s10
; GFX8-NEXT: s_mul_i32 s0, s0, s7
; GFX8-NEXT: s_addc_u32 s0, s11, s0
; GFX8-NEXT: s_mul_i32 s1, s1, s6
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
; GFX8-NEXT: s_addc_u32 s0, s0, s1
; GFX8-NEXT: s_mul_i32 s2, s2, s5
; GFX8-NEXT: s_add_u32 s0, s2, s0
; GFX8-NEXT: s_mul_i32 s3, s3, s4
; GFX8-NEXT: s_add_u32 s3, s3, s0
; GFX8-NEXT: s_mov_b32 s0, s8
; GFX8-NEXT: s_mov_b32 s1, s9
; GFX8-NEXT: s_mov_b32 s2, s10
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s10, s0, s6
; GFX9-NEXT: s_mul_i32 s12, s1, s5
; GFX9-NEXT: s_mul_hi_u32 s11, s0, s6
; GFX9-NEXT: s_mul_hi_u32 s13, s1, s5
; GFX9-NEXT: s_add_u32 s10, s12, s10
; GFX9-NEXT: s_addc_u32 s11, s13, s11
; GFX9-NEXT: s_mul_i32 s12, s2, s4
; GFX9-NEXT: s_mul_hi_u32 s13, s2, s4
; GFX9-NEXT: s_add_u32 s10, s12, s10
; GFX9-NEXT: s_mul_hi_u32 s9, s0, s4
; GFX9-NEXT: s_addc_u32 s11, s13, s11
; GFX9-NEXT: s_mul_i32 s12, s0, s5
; GFX9-NEXT: s_mul_hi_u32 s13, s0, s5
; GFX9-NEXT: s_add_u32 s9, s12, s9
; GFX9-NEXT: s_addc_u32 s10, s13, s10
; GFX9-NEXT: s_mul_i32 s13, s1, s4
; GFX9-NEXT: s_cselect_b32 s12, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4
; GFX9-NEXT: s_add_u32 s9, s13, s9
; GFX9-NEXT: s_mul_i32 s8, s0, s4
; GFX9-NEXT: s_addc_u32 s10, s14, s10
; GFX9-NEXT: s_mul_i32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s0, s11, s0
; GFX9-NEXT: s_mul_i32 s1, s1, s6
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
; GFX9-NEXT: s_addc_u32 s0, s0, s1
; GFX9-NEXT: s_mul_i32 s2, s2, s5
; GFX9-NEXT: s_add_u32 s0, s2, s0
; GFX9-NEXT: s_mul_i32 s3, s3, s4
; GFX9-NEXT: s_add_u32 s3, s3, s0
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_mov_b32 s2, s10
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i128:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s9, s0, s6
; GFX10PLUS-NEXT: s_mul_i32 s11, s1, s5
; GFX10PLUS-NEXT: s_mul_hi_u32 s10, s0, s6
; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s1, s5
; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9
; GFX10PLUS-NEXT: s_mul_i32 s11, s2, s4
; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10
; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s2, s4
; GFX10PLUS-NEXT: s_mul_hi_u32 s8, s0, s4
; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9
; GFX10PLUS-NEXT: s_mul_i32 s11, s0, s5
; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10
; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s0, s5
; GFX10PLUS-NEXT: s_add_u32 s8, s11, s8
; GFX10PLUS-NEXT: s_addc_u32 s9, s12, s9
; GFX10PLUS-NEXT: s_mul_i32 s12, s1, s4
; GFX10PLUS-NEXT: s_mul_hi_u32 s13, s1, s4
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s8, s12, s8
; GFX10PLUS-NEXT: s_mul_i32 s12, s0, s7
; GFX10PLUS-NEXT: s_addc_u32 s7, s13, s9
; GFX10PLUS-NEXT: s_addc_u32 s9, s10, s12
; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s6
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s5
; GFX10PLUS-NEXT: s_addc_u32 s1, s9, s1
; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s4
; GFX10PLUS-NEXT: s_add_i32 s1, s1, s2
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s4
; GFX10PLUS-NEXT: s_add_i32 s3, s1, s3
; GFX10PLUS-NEXT: s_mov_b32 s1, s8
; GFX10PLUS-NEXT: s_mov_b32 s2, s7
; GFX10PLUS-NEXT: ; return to shader part epilog
  %product = mul i128 %num, %den
  ; Reinterpret the 128-bit product as four 32-bit lanes for the return.
  %lanes = bitcast i128 %product to <4 x i32>
  ret <4 x i32> %lanes
}
; i128 multiply in an ordinary function, returned directly as i128.
; The expected code mixes v_mad_u64_u32 partial products with v_mul_lo_u32
; for the top-word terms and propagates carries through add-with-carry
; instructions (v_addc_u32 on gfx7/gfx8, v_addc_co_u32 on gfx9,
; v_add_co_ci_u32 on gfx10/gfx11). gfx10 and gfx11 finish with v_add3_u32
; where the older targets use two more v_mad_u64_u32.
define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-LABEL: v_mul_i128:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX7-NEXT: v_mov_b32_e32 v10, v2
; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7
; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX7-NEXT: v_mov_b32_e32 v2, v11
; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX8-NEXT: v_mov_b32_e32 v10, v2
; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7
; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX8-NEXT: v_mov_b32_e32 v2, v11
; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v8, v0
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX9-NEXT: v_mov_b32_e32 v10, v2
; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7
; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX9-NEXT: v_mov_b32_e32 v2, v11
; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5]
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v8, v0
; GFX10-NEXT: v_mov_b32_e32 v9, v1
; GFX10-NEXT: v_mov_b32_e32 v10, v2
; GFX10-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0
; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7
; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0
; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12]
; GFX10-NEXT: v_mov_b32_e32 v2, v11
; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
; GFX10-NEXT: v_mul_lo_u32 v5, v10, v5
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2]
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
; GFX10-NEXT: v_add3_u32 v3, v4, v5, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i128:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
; GFX11-NEXT: v_mov_b32_e32 v10, v2
; GFX11-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0
; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7
; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v9, v5, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0
; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v10, v4, v[11:12]
; GFX11-NEXT: v_mov_b32_e32 v2, v11
; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
; GFX11-NEXT: v_mul_lo_u32 v5, v10, v5
; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v4, v[1:2]
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo
; GFX11-NEXT: v_add3_u32 v3, v4, v5, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %product = mul i128 %num, %den
  ret i128 %product
}
define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-LABEL: s_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s16, s0
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: v_mul_hi_u32 v2, s1, v1
; GFX7-NEXT: v_mul_hi_u32 v1, s16, v1
; GFX7-NEXT: v_readfirstlane_b32 s17, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s10
; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT: v_readfirstlane_b32 s21, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_readfirstlane_b32 s23, v1
; GFX7-NEXT: v_readfirstlane_b32 s19, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mul_hi_u32 v1, v0, s8
; GFX7-NEXT: v_mul_hi_u32 v3, v2, s8
; GFX7-NEXT: v_mov_b32_e32 v4, s11
; GFX7-NEXT: s_mul_i32 s18, s16, s10
; GFX7-NEXT: v_readfirstlane_b32 s24, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s12
; GFX7-NEXT: v_readfirstlane_b32 s22, v3
; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1
; GFX7-NEXT: s_mul_i32 s20, s1, s9
; GFX7-NEXT: v_mul_hi_u32 v5, s1, v4
; GFX7-NEXT: s_add_u32 s18, s20, s18
; GFX7-NEXT: v_readfirstlane_b32 s25, v3
; GFX7-NEXT: v_mul_hi_u32 v3, v2, s10
; GFX7-NEXT: s_addc_u32 s19, s21, s19
; GFX7-NEXT: s_mul_i32 s21, s2, s8
; GFX7-NEXT: s_cselect_b32 s20, 1, 0
; GFX7-NEXT: s_add_u32 s18, s21, s18
; GFX7-NEXT: v_readfirstlane_b32 s28, v3
; GFX7-NEXT: v_mov_b32_e32 v3, s3
; GFX7-NEXT: s_addc_u32 s19, s22, s19
; GFX7-NEXT: s_mul_i32 s22, s16, s9
; GFX7-NEXT: v_readfirstlane_b32 s27, v5
; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9
; GFX7-NEXT: s_cselect_b32 s21, 1, 0
; GFX7-NEXT: s_add_u32 s17, s22, s17
; GFX7-NEXT: s_addc_u32 s18, s23, s18
; GFX7-NEXT: s_mul_i32 s23, s1, s8
; GFX7-NEXT: s_cselect_b32 s22, 1, 0
; GFX7-NEXT: s_add_u32 s17, s23, s17
; GFX7-NEXT: s_addc_u32 s18, s24, s18
; GFX7-NEXT: s_mul_i32 s24, s16, s12
; GFX7-NEXT: s_mul_i32 s26, s1, s11
; GFX7-NEXT: v_readfirstlane_b32 s29, v5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: s_cselect_b32 s23, 1, 0
; GFX7-NEXT: s_add_u32 s24, s26, s24
; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8
; GFX7-NEXT: s_addc_u32 s25, s27, s25
; GFX7-NEXT: s_mul_i32 s27, s2, s10
; GFX7-NEXT: s_cselect_b32 s26, 1, 0
; GFX7-NEXT: s_add_u32 s24, s27, s24
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10
; GFX7-NEXT: s_addc_u32 s25, s28, s25
; GFX7-NEXT: s_mul_i32 s28, s3, s9
; GFX7-NEXT: s_cselect_b32 s27, 1, 0
; GFX7-NEXT: s_add_u32 s24, s28, s24
; GFX7-NEXT: v_readfirstlane_b32 s30, v6
; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4
; GFX7-NEXT: s_addc_u32 s25, s29, s25
; GFX7-NEXT: s_mul_i32 s29, s4, s8
; GFX7-NEXT: s_cselect_b32 s28, 1, 0
; GFX7-NEXT: s_add_u32 s24, s29, s24
; GFX7-NEXT: v_readfirstlane_b32 s33, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9
; GFX7-NEXT: s_addc_u32 s25, s30, s25
; GFX7-NEXT: s_mul_i32 s30, s16, s11
; GFX7-NEXT: s_cselect_b32 s29, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s31, v6
; GFX7-NEXT: s_add_u32 s19, s30, s19
; GFX7-NEXT: s_addc_u32 s24, s31, s24
; GFX7-NEXT: s_mul_i32 s31, s1, s10
; GFX7-NEXT: s_cselect_b32 s30, 1, 0
; GFX7-NEXT: s_add_u32 s19, s31, s19
; GFX7-NEXT: v_readfirstlane_b32 s34, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v3, s8
; GFX7-NEXT: s_addc_u32 s24, s33, s24
; GFX7-NEXT: s_mul_i32 s33, s2, s9
; GFX7-NEXT: s_cselect_b32 s31, 1, 0
; GFX7-NEXT: s_add_u32 s19, s33, s19
; GFX7-NEXT: s_addc_u32 s24, s34, s24
; GFX7-NEXT: s_mul_i32 s34, s3, s8
; GFX7-NEXT: s_cselect_b32 s33, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: s_add_u32 s19, s34, s19
; GFX7-NEXT: v_mov_b32_e32 v0, s14
; GFX7-NEXT: s_addc_u32 s24, s35, s24
; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT: s_cselect_b32 s34, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s23, 0
; GFX7-NEXT: s_addc_u32 s19, s22, s19
; GFX7-NEXT: v_mov_b32_e32 v2, s13
; GFX7-NEXT: s_cselect_b32 s22, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s21, 0
; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2
; GFX7-NEXT: s_addc_u32 s20, s20, 0
; GFX7-NEXT: v_readfirstlane_b32 s23, v0
; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1
; GFX7-NEXT: s_cmp_lg_u32 s22, 0
; GFX7-NEXT: s_addc_u32 s20, s20, s24
; GFX7-NEXT: s_mul_i32 s22, s16, s14
; GFX7-NEXT: s_mul_i32 s24, s1, s13
; GFX7-NEXT: s_cselect_b32 s21, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
; GFX7-NEXT: s_add_u32 s22, s24, s22
; GFX7-NEXT: s_addc_u32 s23, s35, s23
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11
; GFX7-NEXT: s_mul_i32 s24, s2, s12
; GFX7-NEXT: s_add_u32 s22, s24, s22
; GFX7-NEXT: s_addc_u32 s23, s35, s23
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10
; GFX7-NEXT: s_mul_i32 s24, s3, s11
; GFX7-NEXT: s_add_u32 s22, s24, s22
; GFX7-NEXT: s_addc_u32 s23, s35, s23
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9
; GFX7-NEXT: s_mul_i32 s24, s4, s10
; GFX7-NEXT: s_add_u32 s22, s24, s22
; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1
; GFX7-NEXT: s_addc_u32 s23, s35, s23
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
; GFX7-NEXT: v_mov_b32_e32 v6, s6
; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8
; GFX7-NEXT: s_mul_i32 s24, s5, s9
; GFX7-NEXT: s_add_u32 s22, s24, s22
; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2
; GFX7-NEXT: v_readfirstlane_b32 s36, v1
; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4
; GFX7-NEXT: s_addc_u32 s23, s35, s23
; GFX7-NEXT: s_mul_i32 s24, s6, s8
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
; GFX7-NEXT: s_add_u32 s22, s24, s22
; GFX7-NEXT: s_addc_u32 s23, s35, s23
; GFX7-NEXT: s_mul_i32 s24, s16, s13
; GFX7-NEXT: v_readfirstlane_b32 s35, v2
; GFX7-NEXT: s_add_u32 s24, s24, s25
; GFX7-NEXT: v_readfirstlane_b32 s37, v1
; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10
; GFX7-NEXT: s_addc_u32 s22, s35, s22
; GFX7-NEXT: s_mul_i32 s35, s1, s12
; GFX7-NEXT: s_cselect_b32 s25, 1, 0
; GFX7-NEXT: s_add_u32 s24, s35, s24
; GFX7-NEXT: s_addc_u32 s22, s36, s22
; GFX7-NEXT: s_mul_i32 s36, s2, s11
; GFX7-NEXT: s_cselect_b32 s35, 1, 0
; GFX7-NEXT: s_add_u32 s24, s36, s24
; GFX7-NEXT: v_readfirstlane_b32 s38, v1
; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9
; GFX7-NEXT: s_addc_u32 s22, s37, s22
; GFX7-NEXT: s_mul_i32 s37, s3, s10
; GFX7-NEXT: s_cselect_b32 s36, 1, 0
; GFX7-NEXT: s_add_u32 s24, s37, s24
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8
; GFX7-NEXT: s_addc_u32 s22, s38, s22
; GFX7-NEXT: s_mul_i32 s38, s4, s9
; GFX7-NEXT: s_cselect_b32 s37, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s39, v1
; GFX7-NEXT: s_add_u32 s24, s38, s24
; GFX7-NEXT: s_addc_u32 s22, s39, s22
; GFX7-NEXT: s_mul_i32 s39, s5, s8
; GFX7-NEXT: s_cselect_b32 s38, 1, 0
; GFX7-NEXT: v_readfirstlane_b32 s40, v0
; GFX7-NEXT: s_add_u32 s24, s39, s24
; GFX7-NEXT: s_addc_u32 s22, s40, s22
; GFX7-NEXT: s_cselect_b32 s39, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s31, 0
; GFX7-NEXT: s_addc_u32 s30, s30, 0
; GFX7-NEXT: s_cmp_lg_u32 s33, 0
; GFX7-NEXT: s_addc_u32 s30, s30, 0
; GFX7-NEXT: s_cmp_lg_u32 s34, 0
; GFX7-NEXT: s_addc_u32 s30, s30, 0
; GFX7-NEXT: s_cmp_lg_u32 s21, 0
; GFX7-NEXT: s_addc_u32 s21, s30, s24
; GFX7-NEXT: s_cselect_b32 s24, 1, 0
; GFX7-NEXT: s_cmp_lg_u32 s27, 0
; GFX7-NEXT: s_addc_u32 s26, s26, 0
; GFX7-NEXT: s_cmp_lg_u32 s28, 0
; GFX7-NEXT: s_addc_u32 s26, s26, 0
; GFX7-NEXT: s_cmp_lg_u32 s29, 0
; GFX7-NEXT: s_addc_u32 s26, s26, 0
; GFX7-NEXT: s_cmp_lg_u32 s24, 0
; GFX7-NEXT: s_addc_u32 s22, s26, s22
; GFX7-NEXT: s_mul_i32 s16, s16, s15
; GFX7-NEXT: s_addc_u32 s15, s23, s16
; GFX7-NEXT: s_mul_i32 s1, s1, s14
; GFX7-NEXT: s_cmp_lg_u32 s39, 0
; GFX7-NEXT: s_addc_u32 s1, s15, s1
; GFX7-NEXT: s_mul_i32 s2, s2, s13
; GFX7-NEXT: s_cmp_lg_u32 s38, 0
; GFX7-NEXT: s_addc_u32 s1, s1, s2
; GFX7-NEXT: s_mul_i32 s3, s3, s12
; GFX7-NEXT: s_cmp_lg_u32 s37, 0
; GFX7-NEXT: s_addc_u32 s1, s1, s3
; GFX7-NEXT: s_mul_i32 s4, s4, s11
; GFX7-NEXT: s_cmp_lg_u32 s36, 0
; GFX7-NEXT: s_addc_u32 s1, s1, s4
; GFX7-NEXT: s_mul_i32 s5, s5, s10
; GFX7-NEXT: s_cmp_lg_u32 s35, 0
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_mul_i32 s6, s6, s9
; GFX7-NEXT: s_cmp_lg_u32 s25, 0
; GFX7-NEXT: s_addc_u32 s1, s1, s6
; GFX7-NEXT: s_mul_i32 s7, s7, s8
; GFX7-NEXT: s_mul_i32 s0, s0, s8
; GFX7-NEXT: s_add_u32 s7, s7, s1
; GFX7-NEXT: s_mov_b32 s1, s17
; GFX7-NEXT: s_mov_b32 s2, s18
; GFX7-NEXT: s_mov_b32 s3, s19
; GFX7-NEXT: s_mov_b32 s4, s20
; GFX7-NEXT: s_mov_b32 s5, s21
; GFX7-NEXT: s_mov_b32 s6, s22
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s16, s0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: v_mul_hi_u32 v2, s1, v1
; GFX8-NEXT: v_mul_hi_u32 v1, s16, v1
; GFX8-NEXT: v_readfirstlane_b32 s17, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT: v_readfirstlane_b32 s21, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_readfirstlane_b32 s23, v1
; GFX8-NEXT: v_readfirstlane_b32 s19, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_mul_hi_u32 v1, v0, s8
; GFX8-NEXT: v_mul_hi_u32 v3, v2, s8
; GFX8-NEXT: v_mov_b32_e32 v4, s11
; GFX8-NEXT: s_mul_i32 s18, s16, s10
; GFX8-NEXT: v_readfirstlane_b32 s24, v1
; GFX8-NEXT: v_mov_b32_e32 v1, s12
; GFX8-NEXT: v_readfirstlane_b32 s22, v3
; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1
; GFX8-NEXT: s_mul_i32 s20, s1, s9
; GFX8-NEXT: v_mul_hi_u32 v5, s1, v4
; GFX8-NEXT: s_add_u32 s18, s20, s18
; GFX8-NEXT: v_readfirstlane_b32 s25, v3
; GFX8-NEXT: v_mul_hi_u32 v3, v2, s10
; GFX8-NEXT: s_addc_u32 s19, s21, s19
; GFX8-NEXT: s_mul_i32 s21, s2, s8
; GFX8-NEXT: s_cselect_b32 s20, 1, 0
; GFX8-NEXT: s_add_u32 s18, s21, s18
; GFX8-NEXT: v_readfirstlane_b32 s28, v3
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_addc_u32 s19, s22, s19
; GFX8-NEXT: s_mul_i32 s22, s16, s9
; GFX8-NEXT: v_readfirstlane_b32 s27, v5
; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
; GFX8-NEXT: s_add_u32 s17, s22, s17
; GFX8-NEXT: s_addc_u32 s18, s23, s18
; GFX8-NEXT: s_mul_i32 s23, s1, s8
; GFX8-NEXT: s_cselect_b32 s22, 1, 0
; GFX8-NEXT: s_add_u32 s17, s23, s17
; GFX8-NEXT: s_addc_u32 s18, s24, s18
; GFX8-NEXT: s_mul_i32 s24, s16, s12
; GFX8-NEXT: s_mul_i32 s26, s1, s11
; GFX8-NEXT: v_readfirstlane_b32 s29, v5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: s_cselect_b32 s23, 1, 0
; GFX8-NEXT: s_add_u32 s24, s26, s24
; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8
; GFX8-NEXT: s_addc_u32 s25, s27, s25
; GFX8-NEXT: s_mul_i32 s27, s2, s10
; GFX8-NEXT: s_cselect_b32 s26, 1, 0
; GFX8-NEXT: s_add_u32 s24, s27, s24
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10
; GFX8-NEXT: s_addc_u32 s25, s28, s25
; GFX8-NEXT: s_mul_i32 s28, s3, s9
; GFX8-NEXT: s_cselect_b32 s27, 1, 0
; GFX8-NEXT: s_add_u32 s24, s28, s24
; GFX8-NEXT: v_readfirstlane_b32 s30, v6
; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4
; GFX8-NEXT: s_addc_u32 s25, s29, s25
; GFX8-NEXT: s_mul_i32 s29, s4, s8
; GFX8-NEXT: s_cselect_b32 s28, 1, 0
; GFX8-NEXT: s_add_u32 s24, s29, s24
; GFX8-NEXT: v_readfirstlane_b32 s33, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9
; GFX8-NEXT: s_addc_u32 s25, s30, s25
; GFX8-NEXT: s_mul_i32 s30, s16, s11
; GFX8-NEXT: s_cselect_b32 s29, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s31, v6
; GFX8-NEXT: s_add_u32 s19, s30, s19
; GFX8-NEXT: s_addc_u32 s24, s31, s24
; GFX8-NEXT: s_mul_i32 s31, s1, s10
; GFX8-NEXT: s_cselect_b32 s30, 1, 0
; GFX8-NEXT: s_add_u32 s19, s31, s19
; GFX8-NEXT: v_readfirstlane_b32 s34, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v3, s8
; GFX8-NEXT: s_addc_u32 s24, s33, s24
; GFX8-NEXT: s_mul_i32 s33, s2, s9
; GFX8-NEXT: s_cselect_b32 s31, 1, 0
; GFX8-NEXT: s_add_u32 s19, s33, s19
; GFX8-NEXT: s_addc_u32 s24, s34, s24
; GFX8-NEXT: s_mul_i32 s34, s3, s8
; GFX8-NEXT: s_cselect_b32 s33, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
; GFX8-NEXT: s_add_u32 s19, s34, s19
; GFX8-NEXT: v_mov_b32_e32 v0, s14
; GFX8-NEXT: s_addc_u32 s24, s35, s24
; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT: s_cselect_b32 s34, 1, 0
; GFX8-NEXT: s_cmp_lg_u32 s23, 0
; GFX8-NEXT: s_addc_u32 s19, s22, s19
; GFX8-NEXT: v_mov_b32_e32 v2, s13
; GFX8-NEXT: s_cselect_b32 s22, 1, 0
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2
; GFX8-NEXT: s_addc_u32 s20, s20, 0
; GFX8-NEXT: v_readfirstlane_b32 s23, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1
; GFX8-NEXT: s_cmp_lg_u32 s22, 0
; GFX8-NEXT: s_addc_u32 s20, s20, s24
; GFX8-NEXT: s_mul_i32 s22, s16, s14
; GFX8-NEXT: s_mul_i32 s24, s1, s13
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
; GFX8-NEXT: s_add_u32 s22, s24, s22
; GFX8-NEXT: s_addc_u32 s23, s35, s23
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11
; GFX8-NEXT: s_mul_i32 s24, s2, s12
; GFX8-NEXT: s_add_u32 s22, s24, s22
; GFX8-NEXT: s_addc_u32 s23, s35, s23
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10
; GFX8-NEXT: s_mul_i32 s24, s3, s11
; GFX8-NEXT: s_add_u32 s22, s24, s22
; GFX8-NEXT: s_addc_u32 s23, s35, s23
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9
; GFX8-NEXT: s_mul_i32 s24, s4, s10
; GFX8-NEXT: s_add_u32 s22, s24, s22
; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1
; GFX8-NEXT: s_addc_u32 s23, s35, s23
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8
; GFX8-NEXT: s_mul_i32 s24, s5, s9
; GFX8-NEXT: s_add_u32 s22, s24, s22
; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2
; GFX8-NEXT: v_readfirstlane_b32 s36, v1
; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4
; GFX8-NEXT: s_addc_u32 s23, s35, s23
; GFX8-NEXT: s_mul_i32 s24, s6, s8
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
; GFX8-NEXT: s_add_u32 s22, s24, s22
; GFX8-NEXT: s_addc_u32 s23, s35, s23
; GFX8-NEXT: s_mul_i32 s24, s16, s13
; GFX8-NEXT: v_readfirstlane_b32 s35, v2
; GFX8-NEXT: s_add_u32 s24, s24, s25
; GFX8-NEXT: v_readfirstlane_b32 s37, v1
; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10
; GFX8-NEXT: s_addc_u32 s22, s35, s22
; GFX8-NEXT: s_mul_i32 s35, s1, s12
; GFX8-NEXT: s_cselect_b32 s25, 1, 0
; GFX8-NEXT: s_add_u32 s24, s35, s24
; GFX8-NEXT: s_addc_u32 s22, s36, s22
; GFX8-NEXT: s_mul_i32 s36, s2, s11
; GFX8-NEXT: s_cselect_b32 s35, 1, 0
; GFX8-NEXT: s_add_u32 s24, s36, s24
; GFX8-NEXT: v_readfirstlane_b32 s38, v1
; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9
; GFX8-NEXT: s_addc_u32 s22, s37, s22
; GFX8-NEXT: s_mul_i32 s37, s3, s10
; GFX8-NEXT: s_cselect_b32 s36, 1, 0
; GFX8-NEXT: s_add_u32 s24, s37, s24
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8
; GFX8-NEXT: s_addc_u32 s22, s38, s22
; GFX8-NEXT: s_mul_i32 s38, s4, s9
; GFX8-NEXT: s_cselect_b32 s37, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s39, v1
; GFX8-NEXT: s_add_u32 s24, s38, s24
; GFX8-NEXT: s_addc_u32 s22, s39, s22
; GFX8-NEXT: s_mul_i32 s39, s5, s8
; GFX8-NEXT: s_cselect_b32 s38, 1, 0
; GFX8-NEXT: v_readfirstlane_b32 s40, v0
; GFX8-NEXT: s_add_u32 s24, s39, s24
; GFX8-NEXT: s_addc_u32 s22, s40, s22
; GFX8-NEXT: s_cselect_b32 s39, 1, 0
; GFX8-NEXT: s_cmp_lg_u32 s31, 0
; GFX8-NEXT: s_addc_u32 s30, s30, 0
; GFX8-NEXT: s_cmp_lg_u32 s33, 0
; GFX8-NEXT: s_addc_u32 s30, s30, 0
; GFX8-NEXT: s_cmp_lg_u32 s34, 0
; GFX8-NEXT: s_addc_u32 s30, s30, 0
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
; GFX8-NEXT: s_addc_u32 s21, s30, s24
; GFX8-NEXT: s_cselect_b32 s24, 1, 0
; GFX8-NEXT: s_cmp_lg_u32 s27, 0
; GFX8-NEXT: s_addc_u32 s26, s26, 0
; GFX8-NEXT: s_cmp_lg_u32 s28, 0
; GFX8-NEXT: s_addc_u32 s26, s26, 0
; GFX8-NEXT: s_cmp_lg_u32 s29, 0
; GFX8-NEXT: s_addc_u32 s26, s26, 0
; GFX8-NEXT: s_cmp_lg_u32 s24, 0
; GFX8-NEXT: s_addc_u32 s22, s26, s22
; GFX8-NEXT: s_mul_i32 s16, s16, s15
; GFX8-NEXT: s_addc_u32 s15, s23, s16
; GFX8-NEXT: s_mul_i32 s1, s1, s14
; GFX8-NEXT: s_cmp_lg_u32 s39, 0
; GFX8-NEXT: s_addc_u32 s1, s15, s1
; GFX8-NEXT: s_mul_i32 s2, s2, s13
; GFX8-NEXT: s_cmp_lg_u32 s38, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s3, s3, s12
; GFX8-NEXT: s_cmp_lg_u32 s37, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s3
; GFX8-NEXT: s_mul_i32 s4, s4, s11
; GFX8-NEXT: s_cmp_lg_u32 s36, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s4
; GFX8-NEXT: s_mul_i32 s5, s5, s10
; GFX8-NEXT: s_cmp_lg_u32 s35, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_mul_i32 s6, s6, s9
; GFX8-NEXT: s_cmp_lg_u32 s25, 0
; GFX8-NEXT: s_addc_u32 s1, s1, s6
; GFX8-NEXT: s_mul_i32 s7, s7, s8
; GFX8-NEXT: s_mul_i32 s0, s0, s8
; GFX8-NEXT: s_add_u32 s7, s7, s1
; GFX8-NEXT: s_mov_b32 s1, s17
; GFX8-NEXT: s_mov_b32 s2, s18
; GFX8-NEXT: s_mov_b32 s3, s19
; GFX8-NEXT: s_mov_b32 s4, s20
; GFX8-NEXT: s_mov_b32 s5, s21
; GFX8-NEXT: s_mov_b32 s6, s22
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s18, s0, s10
; GFX9-NEXT: s_mul_i32 s20, s1, s9
; GFX9-NEXT: s_mul_hi_u32 s19, s0, s10
; GFX9-NEXT: s_mul_hi_u32 s21, s1, s9
; GFX9-NEXT: s_add_u32 s18, s20, s18
; GFX9-NEXT: s_addc_u32 s19, s21, s19
; GFX9-NEXT: s_mul_i32 s21, s2, s8
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s22, s2, s8
; GFX9-NEXT: s_add_u32 s18, s21, s18
; GFX9-NEXT: s_mul_hi_u32 s17, s0, s8
; GFX9-NEXT: s_addc_u32 s19, s22, s19
; GFX9-NEXT: s_mul_i32 s22, s0, s9
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s23, s0, s9
; GFX9-NEXT: s_add_u32 s17, s22, s17
; GFX9-NEXT: s_addc_u32 s18, s23, s18
; GFX9-NEXT: s_mul_i32 s23, s1, s8
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8
; GFX9-NEXT: s_add_u32 s17, s23, s17
; GFX9-NEXT: s_addc_u32 s18, s24, s18
; GFX9-NEXT: s_mul_i32 s24, s0, s12
; GFX9-NEXT: s_mul_i32 s26, s1, s11
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s25, s0, s12
; GFX9-NEXT: s_mul_hi_u32 s27, s1, s11
; GFX9-NEXT: s_add_u32 s24, s26, s24
; GFX9-NEXT: s_addc_u32 s25, s27, s25
; GFX9-NEXT: s_mul_i32 s27, s2, s10
; GFX9-NEXT: s_cselect_b32 s26, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s28, s2, s10
; GFX9-NEXT: s_add_u32 s24, s27, s24
; GFX9-NEXT: s_addc_u32 s25, s28, s25
; GFX9-NEXT: s_mul_i32 s28, s3, s9
; GFX9-NEXT: s_cselect_b32 s27, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9
; GFX9-NEXT: s_add_u32 s24, s28, s24
; GFX9-NEXT: s_addc_u32 s25, s29, s25
; GFX9-NEXT: s_mul_i32 s29, s4, s8
; GFX9-NEXT: s_cselect_b32 s28, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s30, s4, s8
; GFX9-NEXT: s_add_u32 s24, s29, s24
; GFX9-NEXT: s_addc_u32 s25, s30, s25
; GFX9-NEXT: s_mul_i32 s30, s0, s11
; GFX9-NEXT: s_cselect_b32 s29, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s31, s0, s11
; GFX9-NEXT: s_add_u32 s19, s30, s19
; GFX9-NEXT: s_addc_u32 s24, s31, s24
; GFX9-NEXT: s_mul_i32 s31, s1, s10
; GFX9-NEXT: s_cselect_b32 s30, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s33, s1, s10
; GFX9-NEXT: s_add_u32 s19, s31, s19
; GFX9-NEXT: s_addc_u32 s24, s33, s24
; GFX9-NEXT: s_mul_i32 s33, s2, s9
; GFX9-NEXT: s_cselect_b32 s31, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s34, s2, s9
; GFX9-NEXT: s_add_u32 s19, s33, s19
; GFX9-NEXT: s_addc_u32 s24, s34, s24
; GFX9-NEXT: s_mul_i32 s34, s3, s8
; GFX9-NEXT: s_cselect_b32 s33, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s35, s3, s8
; GFX9-NEXT: s_add_u32 s19, s34, s19
; GFX9-NEXT: s_addc_u32 s24, s35, s24
; GFX9-NEXT: s_cselect_b32 s34, 1, 0
; GFX9-NEXT: s_cmp_lg_u32 s23, 0
; GFX9-NEXT: s_addc_u32 s19, s22, s19
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
; GFX9-NEXT: s_addc_u32 s20, s20, 0
; GFX9-NEXT: s_cmp_lg_u32 s22, 0
; GFX9-NEXT: s_addc_u32 s20, s20, s24
; GFX9-NEXT: s_mul_i32 s22, s0, s14
; GFX9-NEXT: s_mul_i32 s24, s1, s13
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s23, s0, s14
; GFX9-NEXT: s_mul_hi_u32 s35, s1, s13
; GFX9-NEXT: s_add_u32 s22, s24, s22
; GFX9-NEXT: s_addc_u32 s23, s35, s23
; GFX9-NEXT: s_mul_i32 s24, s2, s12
; GFX9-NEXT: s_mul_hi_u32 s35, s2, s12
; GFX9-NEXT: s_add_u32 s22, s24, s22
; GFX9-NEXT: s_addc_u32 s23, s35, s23
; GFX9-NEXT: s_mul_i32 s24, s3, s11
; GFX9-NEXT: s_mul_hi_u32 s35, s3, s11
; GFX9-NEXT: s_add_u32 s22, s24, s22
; GFX9-NEXT: s_addc_u32 s23, s35, s23
; GFX9-NEXT: s_mul_i32 s24, s4, s10
; GFX9-NEXT: s_mul_hi_u32 s35, s4, s10
; GFX9-NEXT: s_add_u32 s22, s24, s22
; GFX9-NEXT: s_addc_u32 s23, s35, s23
; GFX9-NEXT: s_mul_i32 s24, s5, s9
; GFX9-NEXT: s_mul_hi_u32 s35, s5, s9
; GFX9-NEXT: s_add_u32 s22, s24, s22
; GFX9-NEXT: s_addc_u32 s23, s35, s23
; GFX9-NEXT: s_mul_i32 s24, s6, s8
; GFX9-NEXT: s_mul_hi_u32 s35, s6, s8
; GFX9-NEXT: s_add_u32 s22, s24, s22
; GFX9-NEXT: s_addc_u32 s23, s35, s23
; GFX9-NEXT: s_mul_i32 s24, s0, s13
; GFX9-NEXT: s_mul_hi_u32 s35, s0, s13
; GFX9-NEXT: s_add_u32 s24, s24, s25
; GFX9-NEXT: s_addc_u32 s22, s35, s22
; GFX9-NEXT: s_mul_i32 s35, s1, s12
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s36, s1, s12
; GFX9-NEXT: s_add_u32 s24, s35, s24
; GFX9-NEXT: s_addc_u32 s22, s36, s22
; GFX9-NEXT: s_mul_i32 s36, s2, s11
; GFX9-NEXT: s_cselect_b32 s35, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s37, s2, s11
; GFX9-NEXT: s_add_u32 s24, s36, s24
; GFX9-NEXT: s_addc_u32 s22, s37, s22
; GFX9-NEXT: s_mul_i32 s37, s3, s10
; GFX9-NEXT: s_cselect_b32 s36, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s38, s3, s10
; GFX9-NEXT: s_add_u32 s24, s37, s24
; GFX9-NEXT: s_addc_u32 s22, s38, s22
; GFX9-NEXT: s_mul_i32 s38, s4, s9
; GFX9-NEXT: s_cselect_b32 s37, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s39, s4, s9
; GFX9-NEXT: s_add_u32 s24, s38, s24
; GFX9-NEXT: s_addc_u32 s22, s39, s22
; GFX9-NEXT: s_mul_i32 s39, s5, s8
; GFX9-NEXT: s_cselect_b32 s38, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s40, s5, s8
; GFX9-NEXT: s_add_u32 s24, s39, s24
; GFX9-NEXT: s_addc_u32 s22, s40, s22
; GFX9-NEXT: s_cselect_b32 s39, 1, 0
; GFX9-NEXT: s_cmp_lg_u32 s31, 0
; GFX9-NEXT: s_addc_u32 s30, s30, 0
; GFX9-NEXT: s_cmp_lg_u32 s33, 0
; GFX9-NEXT: s_addc_u32 s30, s30, 0
; GFX9-NEXT: s_cmp_lg_u32 s34, 0
; GFX9-NEXT: s_addc_u32 s30, s30, 0
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
; GFX9-NEXT: s_addc_u32 s21, s30, s24
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_cmp_lg_u32 s27, 0
; GFX9-NEXT: s_addc_u32 s26, s26, 0
; GFX9-NEXT: s_cmp_lg_u32 s28, 0
; GFX9-NEXT: s_addc_u32 s26, s26, 0
; GFX9-NEXT: s_cmp_lg_u32 s29, 0
; GFX9-NEXT: s_addc_u32 s26, s26, 0
; GFX9-NEXT: s_cmp_lg_u32 s24, 0
; GFX9-NEXT: s_mul_i32 s16, s0, s8
; GFX9-NEXT: s_addc_u32 s22, s26, s22
; GFX9-NEXT: s_mul_i32 s0, s0, s15
; GFX9-NEXT: s_addc_u32 s0, s23, s0
; GFX9-NEXT: s_mul_i32 s1, s1, s14
; GFX9-NEXT: s_cmp_lg_u32 s39, 0
; GFX9-NEXT: s_addc_u32 s0, s0, s1
; GFX9-NEXT: s_mul_i32 s2, s2, s13
; GFX9-NEXT: s_cmp_lg_u32 s38, 0
; GFX9-NEXT: s_addc_u32 s0, s0, s2
; GFX9-NEXT: s_mul_i32 s3, s3, s12
; GFX9-NEXT: s_cmp_lg_u32 s37, 0
; GFX9-NEXT: s_addc_u32 s0, s0, s3
; GFX9-NEXT: s_mul_i32 s4, s4, s11
; GFX9-NEXT: s_cmp_lg_u32 s36, 0
; GFX9-NEXT: s_addc_u32 s0, s0, s4
; GFX9-NEXT: s_mul_i32 s5, s5, s10
; GFX9-NEXT: s_cmp_lg_u32 s35, 0
; GFX9-NEXT: s_addc_u32 s0, s0, s5
; GFX9-NEXT: s_mul_i32 s6, s6, s9
; GFX9-NEXT: s_cmp_lg_u32 s25, 0
; GFX9-NEXT: s_addc_u32 s0, s0, s6
; GFX9-NEXT: s_mul_i32 s7, s7, s8
; GFX9-NEXT: s_add_u32 s7, s7, s0
; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: s_mov_b32 s1, s17
; GFX9-NEXT: s_mov_b32 s2, s18
; GFX9-NEXT: s_mov_b32 s3, s19
; GFX9-NEXT: s_mov_b32 s4, s20
; GFX9-NEXT: s_mov_b32 s5, s21
; GFX9-NEXT: s_mov_b32 s6, s22
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i256:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s17, s0, s10
; GFX10PLUS-NEXT: s_mul_i32 s19, s1, s9
; GFX10PLUS-NEXT: s_mul_hi_u32 s18, s0, s10
; GFX10PLUS-NEXT: s_mul_hi_u32 s20, s1, s9
; GFX10PLUS-NEXT: s_add_u32 s17, s19, s17
; GFX10PLUS-NEXT: s_addc_u32 s18, s20, s18
; GFX10PLUS-NEXT: s_mul_i32 s20, s2, s8
; GFX10PLUS-NEXT: s_mul_hi_u32 s21, s2, s8
; GFX10PLUS-NEXT: s_cselect_b32 s19, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s17, s20, s17
; GFX10PLUS-NEXT: s_mul_hi_u32 s16, s0, s8
; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18
; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s9
; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s9
; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s16, s21, s16
; GFX10PLUS-NEXT: s_addc_u32 s17, s22, s17
; GFX10PLUS-NEXT: s_mul_i32 s22, s1, s8
; GFX10PLUS-NEXT: s_mul_hi_u32 s23, s1, s8
; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s16, s22, s16
; GFX10PLUS-NEXT: s_addc_u32 s17, s23, s17
; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s12
; GFX10PLUS-NEXT: s_mul_i32 s25, s1, s11
; GFX10PLUS-NEXT: s_mul_hi_u32 s24, s0, s12
; GFX10PLUS-NEXT: s_mul_hi_u32 s26, s1, s11
; GFX10PLUS-NEXT: s_cselect_b32 s22, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s23, s25, s23
; GFX10PLUS-NEXT: s_addc_u32 s24, s26, s24
; GFX10PLUS-NEXT: s_mul_i32 s26, s2, s10
; GFX10PLUS-NEXT: s_mul_hi_u32 s27, s2, s10
; GFX10PLUS-NEXT: s_cselect_b32 s25, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s23, s26, s23
; GFX10PLUS-NEXT: s_addc_u32 s24, s27, s24
; GFX10PLUS-NEXT: s_mul_i32 s27, s3, s9
; GFX10PLUS-NEXT: s_mul_hi_u32 s28, s3, s9
; GFX10PLUS-NEXT: s_cselect_b32 s26, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s23, s27, s23
; GFX10PLUS-NEXT: s_addc_u32 s24, s28, s24
; GFX10PLUS-NEXT: s_mul_i32 s28, s4, s8
; GFX10PLUS-NEXT: s_mul_hi_u32 s29, s4, s8
; GFX10PLUS-NEXT: s_cselect_b32 s27, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s23, s28, s23
; GFX10PLUS-NEXT: s_addc_u32 s24, s29, s24
; GFX10PLUS-NEXT: s_mul_i32 s29, s0, s11
; GFX10PLUS-NEXT: s_mul_hi_u32 s30, s0, s11
; GFX10PLUS-NEXT: s_cselect_b32 s28, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s18, s29, s18
; GFX10PLUS-NEXT: s_addc_u32 s23, s30, s23
; GFX10PLUS-NEXT: s_mul_i32 s30, s1, s10
; GFX10PLUS-NEXT: s_mul_hi_u32 s31, s1, s10
; GFX10PLUS-NEXT: s_cselect_b32 s29, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s18, s30, s18
; GFX10PLUS-NEXT: s_addc_u32 s23, s31, s23
; GFX10PLUS-NEXT: s_mul_i32 s31, s2, s9
; GFX10PLUS-NEXT: s_mul_hi_u32 s33, s2, s9
; GFX10PLUS-NEXT: s_cselect_b32 s30, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s18, s31, s18
; GFX10PLUS-NEXT: s_addc_u32 s23, s33, s23
; GFX10PLUS-NEXT: s_mul_i32 s33, s3, s8
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s8
; GFX10PLUS-NEXT: s_cselect_b32 s31, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s18, s33, s18
; GFX10PLUS-NEXT: s_addc_u32 s23, s34, s23
; GFX10PLUS-NEXT: s_cselect_b32 s33, 1, 0
; GFX10PLUS-NEXT: s_cmp_lg_u32 s22, 0
; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s14
; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18
; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0
; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s1, s13
; GFX10PLUS-NEXT: s_addc_u32 s19, s19, 0
; GFX10PLUS-NEXT: s_cmp_lg_u32 s21, 0
; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s14
; GFX10PLUS-NEXT: s_addc_u32 s19, s19, s23
; GFX10PLUS-NEXT: s_mul_i32 s23, s1, s13
; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT: s_mul_i32 s23, s2, s12
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s2, s12
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT: s_mul_i32 s23, s3, s11
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s11
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT: s_mul_i32 s23, s4, s10
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s4, s10
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT: s_mul_i32 s23, s5, s9
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s5, s9
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT: s_mul_i32 s23, s6, s8
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s6, s8
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s13
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s0, s13
; GFX10PLUS-NEXT: s_add_u32 s23, s23, s24
; GFX10PLUS-NEXT: s_addc_u32 s21, s34, s21
; GFX10PLUS-NEXT: s_mul_i32 s34, s1, s12
; GFX10PLUS-NEXT: s_mul_hi_u32 s35, s1, s12
; GFX10PLUS-NEXT: s_cselect_b32 s24, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s23, s34, s23
; GFX10PLUS-NEXT: s_addc_u32 s21, s35, s21
; GFX10PLUS-NEXT: s_mul_i32 s35, s2, s11
; GFX10PLUS-NEXT: s_mul_hi_u32 s36, s2, s11
; GFX10PLUS-NEXT: s_cselect_b32 s34, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s23, s35, s23
; GFX10PLUS-NEXT: s_addc_u32 s21, s36, s21
; GFX10PLUS-NEXT: s_mul_i32 s36, s3, s10
; GFX10PLUS-NEXT: s_mul_hi_u32 s37, s3, s10
; GFX10PLUS-NEXT: s_cselect_b32 s35, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s23, s36, s23
; GFX10PLUS-NEXT: s_addc_u32 s21, s37, s21
; GFX10PLUS-NEXT: s_mul_i32 s37, s4, s9
; GFX10PLUS-NEXT: s_mul_hi_u32 s38, s4, s9
; GFX10PLUS-NEXT: s_cselect_b32 s36, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s23, s37, s23
; GFX10PLUS-NEXT: s_addc_u32 s21, s38, s21
; GFX10PLUS-NEXT: s_mul_i32 s38, s5, s8
; GFX10PLUS-NEXT: s_mul_hi_u32 s39, s5, s8
; GFX10PLUS-NEXT: s_cselect_b32 s37, 1, 0
; GFX10PLUS-NEXT: s_add_u32 s23, s38, s23
; GFX10PLUS-NEXT: s_addc_u32 s21, s39, s21
; GFX10PLUS-NEXT: s_cselect_b32 s38, 1, 0
; GFX10PLUS-NEXT: s_cmp_lg_u32 s30, 0
; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s14
; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0
; GFX10PLUS-NEXT: s_cmp_lg_u32 s31, 0
; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s13
; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0
; GFX10PLUS-NEXT: s_cmp_lg_u32 s33, 0
; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s12
; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0
; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0
; GFX10PLUS-NEXT: s_mul_i32 s4, s4, s11
; GFX10PLUS-NEXT: s_addc_u32 s20, s29, s23
; GFX10PLUS-NEXT: s_cselect_b32 s23, 1, 0
; GFX10PLUS-NEXT: s_cmp_lg_u32 s26, 0
; GFX10PLUS-NEXT: s_mul_i32 s26, s0, s15
; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0
; GFX10PLUS-NEXT: s_cmp_lg_u32 s27, 0
; GFX10PLUS-NEXT: s_mul_i32 s5, s5, s10
; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0
; GFX10PLUS-NEXT: s_cmp_lg_u32 s28, 0
; GFX10PLUS-NEXT: s_mul_i32 s6, s6, s9
; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0
; GFX10PLUS-NEXT: s_cmp_lg_u32 s23, 0
; GFX10PLUS-NEXT: s_mul_i32 s7, s7, s8
; GFX10PLUS-NEXT: s_addc_u32 s15, s25, s21
; GFX10PLUS-NEXT: s_addc_u32 s21, s22, s26
; GFX10PLUS-NEXT: s_cmp_lg_u32 s38, 0
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s8
; GFX10PLUS-NEXT: s_addc_u32 s1, s21, s1
; GFX10PLUS-NEXT: s_cmp_lg_u32 s37, 0
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s2
; GFX10PLUS-NEXT: s_cmp_lg_u32 s36, 0
; GFX10PLUS-NEXT: s_mov_b32 s2, s17
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3
; GFX10PLUS-NEXT: s_cmp_lg_u32 s35, 0
; GFX10PLUS-NEXT: s_mov_b32 s3, s18
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s4
; GFX10PLUS-NEXT: s_cmp_lg_u32 s34, 0
; GFX10PLUS-NEXT: s_mov_b32 s4, s19
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5
; GFX10PLUS-NEXT: s_cmp_lg_u32 s24, 0
; GFX10PLUS-NEXT: s_mov_b32 s5, s20
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s6
; GFX10PLUS-NEXT: s_mov_b32 s6, s15
; GFX10PLUS-NEXT: s_add_i32 s7, s1, s7
; GFX10PLUS-NEXT: s_mov_b32 s1, s16
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = mul i256 %num, %den
%cast = bitcast i256 %result to <8 x i32>
ret <8 x i32> %cast
}
define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7-LABEL: v_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11
; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc
; GFX7-NEXT: v_mov_b32_e32 v20, v18
; GFX7-NEXT: v_mov_b32_e32 v18, v19
; GFX7-NEXT: v_mov_b32_e32 v19, v16
; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
; GFX7-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
; GFX7-NEXT: v_mov_b32_e32 v19, v22
; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
; GFX7-NEXT: v_mul_lo_u32 v24, v3, v12
; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
; GFX7-NEXT: v_mul_lo_u32 v22, v2, v13
; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13]
; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
; GFX7-NEXT: v_mov_b32_e32 v20, v11
; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
; GFX7-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14
; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15
; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11
; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc
; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc
; GFX8-NEXT: v_mov_b32_e32 v20, v18
; GFX8-NEXT: v_mov_b32_e32 v18, v19
; GFX8-NEXT: v_mov_b32_e32 v19, v16
; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
; GFX8-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
; GFX8-NEXT: v_mov_b32_e32 v19, v22
; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
; GFX8-NEXT: v_mul_lo_u32 v24, v3, v12
; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
; GFX8-NEXT: v_mul_lo_u32 v22, v2, v13
; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13]
; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
; GFX8-NEXT: v_mov_b32_e32 v20, v11
; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
; GFX8-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15
; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, v10
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11
; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v20, vcc
; GFX9-NEXT: v_mov_b32_e32 v20, v18
; GFX9-NEXT: v_mov_b32_e32 v18, v19
; GFX9-NEXT: v_mov_b32_e32 v19, v16
; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
; GFX9-NEXT: v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5]
; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
; GFX9-NEXT: v_mov_b32_e32 v19, v22
; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
; GFX9-NEXT: v_mul_lo_u32 v24, v3, v12
; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
; GFX9-NEXT: v_mul_lo_u32 v22, v2, v13
; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13]
; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13]
; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13]
; GFX9-NEXT: v_mov_b32_e32 v20, v11
; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
; GFX9-NEXT: v_addc_co_u32_e64 v11, s[12:13], 0, v2, s[12:13]
; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13]
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15
; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13]
; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13]
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13]
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13]
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15]
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11]
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9]
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7]
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5]
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc
; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v10
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v16, v0
; GFX10-NEXT: v_mov_b32_e32 v17, v1
; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9
; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10
; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0
; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0
; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19]
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21]
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mov_b32_e32 v20, v22
; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20]
; GFX10-NEXT: v_mov_b32_e32 v20, v18
; GFX10-NEXT: v_mov_b32_e32 v19, v22
; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15
; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0
; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6
; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25]
; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12
; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15]
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13
; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v13, v1
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12]
; GFX10-NEXT: v_mov_b32_e32 v14, v21
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19]
; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14]
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2]
; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11]
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13]
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9
; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4
; GFX10-NEXT: v_add_nc_u32_e32 v7, v8, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i256:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
; GFX11-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX11-NEXT: v_mul_lo_u32 v27, v6, v9
; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0
; GFX11-NEXT: v_mad_u64_u32 v[18:19], null, v16, v12, 0
; GFX11-NEXT: v_mul_lo_u32 v30, v17, v14
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[18:19], s0, v17, v11, v[18:19]
; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX11-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX11-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v8, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
; GFX11-NEXT: v_mov_b32_e32 v20, v22
; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
; GFX11-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[19:20]
; GFX11-NEXT: v_mov_b32_e32 v20, v18
; GFX11-NEXT: v_mov_b32_e32 v19, v22
; GFX11-NEXT: v_mul_lo_u32 v22, v16, v15
; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[19:20]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v8, 0
; GFX11-NEXT: v_mul_lo_u32 v20, v4, v11
; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2
; GFX11-NEXT: v_mad_u64_u32 v[18:19], s1, v2, v11, v[24:25]
; GFX11-NEXT: v_mul_lo_u32 v25, v3, v12
; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15]
; GFX11-NEXT: v_mov_b32_e32 v14, v21
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
; GFX11-NEXT: v_mad_u64_u32 v[18:19], s3, v3, v10, v[18:19]
; GFX11-NEXT: v_mul_lo_u32 v24, v2, v13
; GFX11-NEXT: v_mov_b32_e32 v13, v1
; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[11:12]
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
; GFX11-NEXT: v_mad_u64_u32 v[10:11], s2, v4, v9, v[18:19]
; GFX11-NEXT: v_mad_u64_u32 v[12:13], s4, v16, v9, v[13:14]
; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v8, v[1:2]
; GFX11-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4
; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v8, v[10:11]
; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v8, v[12:13]
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0
; GFX11-NEXT: v_add_nc_u32_e32 v7, v8, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
}