Files
clang-p2996/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
Matt Arsenault 4a36e96c3f RegAllocGreedy: Account for reserved registers in num regs heuristic
This simple heuristic uses the estimated live range length combined
with the number of registers in the class to switch which heuristic to
use. This was taking the raw number of registers in the class, even
though not all of them may be available. AMDGPU heavily relies on
dynamically reserved numbers of registers based on user attributes to
satisfy occupancy constraints, so the raw number is highly misleading.

There are still a few problems here. In the original testcase that
made me notice this, the live range size is incorrect after the
scheduler rearranges instructions, since the instructions don't have
the original InstrDist offsets. Additionally, I think it would be more
appropriate to use the number of disjointly allocatable registers in
the class. For the AMDGPU register tuples, there are a large number of
registers in each tuple class, but only a small fraction can actually
be allocated at the same time since they all overlap with each
other. It seems we do not have a query that corresponds to the number
of independently allocatable registers. Relatedly, I'm still debugging
some allocation failures where overlapping tuples seem to not be
handled correctly.

The test changes are mostly noise. There are a handful of x86 tests
that look like regressions with an additional spill, and a handful
that now avoid a spill. The worst looking regression is likely
test/CodeGen/Thumb2/mve-vld4.ll, which introduces a few additional
spills. test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll
shows a massive improvement by completely eliminating a large number
of spills inside a loop.
2021-09-14 21:00:29 -04:00

2996 lines
119 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; Multiplies two i16 values passed as inreg arguments to an amdgpu_ps function.
; Per the autogenerated checks below, gfx700 is expected to emit a bare
; s_mul_i32, while gfx801, gfx900 and gfx1010 first mask both operands with
; 0xffff (s_and_b32) before the s_mul_i32.
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX7-LABEL: s_mul_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
; Multiplies two i16 values in a function with no explicit calling convention
; and no inreg arguments. The checks expect gfx700 to mask both operands with
; 0xffff and multiply with v_mul_u32_u24, whereas gfx801, gfx900 and gfx1010
; are expected to use a single v_mul_lo_u16.
define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX7-LABEL: v_mul_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
; i16 multiply in an amdgpu_ps function whose inreg arguments and return value
; are all marked zeroext. On every target the checks expect the s_mul_i32
; result to be masked with 0xffff afterwards; gfx801, gfx900 and gfx1010 are
; additionally expected to mask both inputs first, gfx700 is not.
define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
; GFX7-LABEL: s_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16_zeroext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
; i16 multiply with zeroext arguments and a zeroext return value, without
; inreg. Expected output per the checks: gfx700 multiplies with v_mul_u32_u24
; and then masks the result with 0xffff; gfx801 and gfx900 emit only
; v_mul_lo_u16 with no separate extension; gfx1010 follows v_mul_lo_u16 with a
; v_bfe_u32 of the low 16 bits.
define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX7-LABEL: v_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16_zeroext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
; i16 multiply in an amdgpu_ps function whose inreg arguments and return value
; are all marked signext. Every target is expected to finish with
; s_sext_i32_i16 on the s_mul_i32 result; gfx801, gfx900 and gfx1010 are also
; expected to mask both inputs with 0xffff beforehand, gfx700 is not.
define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
; GFX7-LABEL: s_mul_i16_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: s_sext_i32_i16 s0, s0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_sext_i32_i16 s0, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_sext_i32_i16 s0, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16_signext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_sext_i32_i16 s0, s0
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
; i16 multiply with signext arguments and a signext return value, without
; inreg. All four targets are expected to sign-extend the product with
; v_bfe_i32 v0, v0, 0, 16. gfx700 masks both inputs with 0xffff and multiplies
; with v_mul_u32_u24; gfx801, gfx900 and gfx1010 multiply with v_mul_lo_u16.
define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX7-LABEL: v_mul_i16_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16_signext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
; i32 multiply of two inreg arguments in an amdgpu_ps function. Both check
; groups below expect the same single s_mul_i32 and nothing else before the
; return.
define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GCN-LABEL: s_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
%result = mul i32 %num, %den
ret i32 %result
}
; i32 multiply with no explicit calling convention and no inreg arguments.
; Both check groups expect a single v_mul_lo_u32; the gfx1010 group differs
; only by the extra s_waitcnt_vscnt at function entry.
define i32 @v_mul_i32(i32 %num, i32 %den) {
; GCN-LABEL: v_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i32 %num, %den
ret i32 %result
}
; Element-wise multiply of two <2 x i32> inreg arguments in an amdgpu_ps
; function. Both check groups expect one s_mul_i32 per element (s0*s2 and
; s1*s3).
define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
; GCN-LABEL: s_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s2
; GCN-NEXT: s_mul_i32 s1, s1, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: s_mul_i32 s1, s1, s3
; GFX10-NEXT: ; return to shader part epilog
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
; Element-wise multiply of two <2 x i32> arguments with no explicit calling
; convention and no inreg. Both check groups expect one v_mul_lo_u32 per
; element (v0*v2 and v1*v3); the gfx1010 group adds only the entry
; s_waitcnt_vscnt.
define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GCN-LABEL: v_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v2
; GCN-NEXT: v_mul_lo_u32 v1, v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
; i33 multiply of two inreg arguments; note this function uses amdgpu_cs.
; The checks expect a two-register (s0/s1) result built from three s_mul_i32
; plus one high-half multiply. gfx700 and gfx801 are expected to compute that
; high half with v_mul_hi_u32 and move the sum back with v_readfirstlane_b32,
; whereas gfx900 and gfx1010 are expected to use s_mul_hi_u32. No instruction
; in the checks truncates the upper register to the 33-bit width.
define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) {
; GFX7-LABEL: s_mul_i33:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s4, s0, s2
; GFX7-NEXT: s_mul_i32 s1, s1, s2
; GFX7-NEXT: s_mul_i32 s0, s0, s3
; GFX7-NEXT: s_add_i32 s1, s1, s0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i33:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s4, s0, s2
; GFX8-NEXT: s_mul_i32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s3
; GFX8-NEXT: s_add_i32 s1, s1, s0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i33:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s3, s0, s3
; GFX9-NEXT: s_mul_i32 s4, s0, s2
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s2
; GFX9-NEXT: s_add_i32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s1, s1, s0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i33:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s3, s0, s3
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s3
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: ; return to shader part epilog
%result = mul i33 %num, %den
ret i33 %result
}
; i64 multiply of two inreg arguments in an amdgpu_ps function. The checks
; expect the 64-bit product to be assembled from 32-bit pieces: three
; s_mul_i32 plus one high-half multiply of the low words. gfx700 and gfx801
; are expected to compute that high half with v_mul_hi_u32 and move the sum
; back with v_readfirstlane_b32; gfx900 and gfx1010 are expected to use
; s_mul_hi_u32.
define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX7-LABEL: s_mul_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s4, s0, s2
; GFX7-NEXT: s_mul_i32 s1, s1, s2
; GFX7-NEXT: s_mul_i32 s0, s0, s3
; GFX7-NEXT: s_add_i32 s1, s1, s0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s4, s0, s2
; GFX8-NEXT: s_mul_i32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s3
; GFX8-NEXT: s_add_i32 s1, s1, s0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s3, s0, s3
; GFX9-NEXT: s_mul_i32 s4, s0, s2
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s2
; GFX9-NEXT: s_add_i32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s1, s1, s0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s3, s0, s3
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s3
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: ; return to shader part epilog
%result = mul i64 %num, %den
ret i64 %result
}
; i64 multiply with no explicit calling convention and no inreg arguments.
; On every target the checks expect three v_mul_lo_u32 and one v_mul_hi_u32.
; gfx700 and gfx801 are expected to sum the high-word terms with two
; carry-writing adds and a final v_mov of the low word; gfx900 and gfx1010
; are expected to fold the sum into a single v_add3_u32.
define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX7-LABEL: v_mul_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX7-NEXT: v_mul_lo_u32 v4, v0, v3
; GFX7-NEXT: v_mul_lo_u32 v3, v0, v2
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX8-NEXT: v_mul_lo_u32 v4, v0, v3
; GFX8-NEXT: v_mul_lo_u32 v3, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX9-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v1, v1, v3, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v2
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX10-NEXT: v_add3_u32 v1, v1, v3, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i64 %num, %den
ret i64 %result
}
; i96 multiply of two inreg arguments in an amdgpu_ps function; the product is
; bitcast to <3 x i32> for the return. The checks expect the result in s0-s2,
; built from 32-bit partial products with carries materialized through
; s_cselect_b32. gfx700 and gfx801 are expected to mix s_ and v_ instructions
; (v_mul_hi_u32 for the high halves, v_readfirstlane_b32 to move results
; back), while the gfx900 and gfx1010 sequences consist of s_ instructions
; only.
define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX7-LABEL: s_mul_i96:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s7, s1, s3
; GFX7-NEXT: s_mul_i32 s8, s0, s4
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_add_u32 s7, s7, s8
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s3
; GFX7-NEXT: v_mov_b32_e32 v3, s4
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s7, v0
; GFX7-NEXT: s_mul_i32 s2, s2, s3
; GFX7-NEXT: s_mul_i32 s7, s1, s4
; GFX7-NEXT: v_mul_hi_u32 v3, s0, v3
; GFX7-NEXT: s_mul_i32 s6, s0, s3
; GFX7-NEXT: s_cselect_b32 s8, 1, 0
; GFX7-NEXT: s_mul_i32 s5, s0, s5
; GFX7-NEXT: s_add_i32 s0, s2, s7
; GFX7-NEXT: s_add_i32 s0, s0, s5
; GFX7-NEXT: s_and_b32 s8, s8, 1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s8, v1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
; GFX7-NEXT: s_mov_b32 s0, s6
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i96:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s7, s1, s3
; GFX8-NEXT: s_mul_i32 s8, s0, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_add_u32 s7, s7, s8
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s3
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s7, v0
; GFX8-NEXT: s_mul_i32 s2, s2, s3
; GFX8-NEXT: s_mul_i32 s7, s1, s4
; GFX8-NEXT: v_mul_hi_u32 v3, s0, v3
; GFX8-NEXT: s_mul_i32 s6, s0, s3
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
; GFX8-NEXT: s_mul_i32 s5, s0, s5
; GFX8-NEXT: s_add_i32 s0, s2, s7
; GFX8-NEXT: s_add_i32 s0, s0, s5
; GFX8-NEXT: s_and_b32 s8, s8, 1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s8, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: s_mov_b32 s0, s6
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i96:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s7, s1, s3
; GFX9-NEXT: s_mul_i32 s8, s0, s4
; GFX9-NEXT: s_add_u32 s7, s7, s8
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s9, s0, s3
; GFX9-NEXT: s_and_b32 s8, s8, 1
; GFX9-NEXT: s_add_u32 s7, s7, s9
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
; GFX9-NEXT: s_and_b32 s9, s9, 1
; GFX9-NEXT: s_add_i32 s8, s8, s9
; GFX9-NEXT: s_mul_i32 s2, s2, s3
; GFX9-NEXT: s_mul_i32 s9, s1, s4
; GFX9-NEXT: s_mul_i32 s5, s0, s5
; GFX9-NEXT: s_add_i32 s2, s2, s9
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s2, s2, s5
; GFX9-NEXT: s_mul_i32 s6, s0, s3
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s4
; GFX9-NEXT: s_add_i32 s1, s2, s1
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_add_i32 s2, s0, s8
; GFX9-NEXT: s_mov_b32 s0, s6
; GFX9-NEXT: s_mov_b32 s1, s7
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i96:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s6, s1, s3
; GFX10-NEXT: s_mul_i32 s7, s0, s4
; GFX10-NEXT: s_mul_hi_u32 s8, s0, s3
; GFX10-NEXT: s_add_u32 s6, s6, s7
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_mul_i32 s2, s2, s3
; GFX10-NEXT: s_and_b32 s7, s7, 1
; GFX10-NEXT: s_mul_i32 s9, s1, s4
; GFX10-NEXT: s_add_u32 s6, s6, s8
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_mul_i32 s5, s0, s5
; GFX10-NEXT: s_add_i32 s2, s2, s9
; GFX10-NEXT: s_mul_hi_u32 s1, s1, s3
; GFX10-NEXT: s_add_i32 s2, s2, s5
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s4
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: s_add_i32 s7, s7, s8
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: s_mul_i32 s0, s0, s3
; GFX10-NEXT: s_add_i32 s2, s1, s7
; GFX10-NEXT: s_mov_b32 s1, s6
; GFX10-NEXT: ; return to shader part epilog
%result = mul i96 %num, %den
%cast = bitcast i96 %result to <3 x i32>
ret <3 x i32> %cast
}
; i96 multiply with no explicit calling convention and no inreg arguments,
; returned directly as i96. The checks expect the result in v0-v2, built from
; v_mul_lo_u32 / v_mul_hi_u32 partial products, with the carries out of the
; middle word materialized by v_cndmask_b32 from the add's carry output.
; gfx900 and gfx1010 are expected to combine the high-word terms with
; v_add3_u32; gfx700 and gfx801 use chains of two-operand adds.
define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX7-LABEL: v_mul_i96:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX7-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX7-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX7-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX7-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX7-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX7-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v9
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v0, v8
; GFX7-NEXT: v_mov_b32_e32 v0, v6
; GFX7-NEXT: v_mov_b32_e32 v1, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i96:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX8-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX8-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX8-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX8-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX8-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v8
; GFX8-NEXT: v_mov_b32_e32 v0, v6
; GFX8-NEXT: v_mov_b32_e32 v1, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i96:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX9-NEXT: v_mul_lo_u32 v10, v1, v4
; GFX9-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX9-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v2, v2, v10
; GFX9-NEXT: v_add_u32_e32 v3, v8, v9
; GFX9-NEXT: v_add3_u32 v1, v2, v5, v1
; GFX9-NEXT: v_add3_u32 v2, v1, v0, v3
; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: v_mov_b32_e32 v1, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v6, v1, v3
; GFX10-NEXT: v_mul_lo_u32 v7, v0, v4
; GFX10-NEXT: v_mul_hi_u32 v8, v0, v3
; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX10-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX10-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3
; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v7
; GFX10-NEXT: v_mul_hi_u32 v7, v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v9
; GFX10-NEXT: v_add_co_u32 v1, s4, v6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v2, v2, v5, v7
; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v6
; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
}
; i128 multiply of two inreg arguments in an amdgpu_ps function; the product
; is bitcast to <4 x i32> for the return. The checks expect the result in
; s0-s3, built from 32-bit partial products with each carry captured through
; s_cselect_b32 (or v_cndmask_b32 where the add is a v_ instruction). gfx700
; and gfx801 are expected to mix s_ and v_ instructions (v_mul_hi_u32 for the
; high halves, v_readfirstlane_b32 to move results back), while the gfx900 and
; gfx1010 sequences consist of s_ instructions only.
define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX7-LABEL: s_mul_i128:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s9, s1, s4
; GFX7-NEXT: s_mul_i32 s10, s0, s5
; GFX7-NEXT: s_add_u32 s9, s9, s10
; GFX7-NEXT: s_cselect_b32 s10, 1, 0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s9, v0
; GFX7-NEXT: s_and_b32 s10, s10, 1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s10, v1
; GFX7-NEXT: s_mul_i32 s9, s2, s4
; GFX7-NEXT: s_mul_i32 s10, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_add_u32 s9, s9, s10
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX7-NEXT: s_cselect_b32 s10, 1, 0
; GFX7-NEXT: s_mul_i32 s11, s0, s6
; GFX7-NEXT: s_and_b32 s10, s10, 1
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: s_add_u32 s9, s9, s11
; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX7-NEXT: s_cselect_b32 s11, 1, 0
; GFX7-NEXT: s_and_b32 s11, s11, 1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s9, v2
; GFX7-NEXT: s_add_i32 s10, s10, s11
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s10, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: s_mul_i32 s3, s3, s4
; GFX7-NEXT: s_mul_i32 s5, s2, s5
; GFX7-NEXT: v_mul_hi_u32 v4, v4, s4
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: s_mul_i32 s8, s0, s4
; GFX7-NEXT: s_mul_i32 s9, s1, s6
; GFX7-NEXT: s_mul_i32 s7, s0, s7
; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX7-NEXT: v_mul_hi_u32 v5, s0, v5
; GFX7-NEXT: s_add_i32 s0, s3, s5
; GFX7-NEXT: s_add_i32 s0, s0, s9
; GFX7-NEXT: s_add_i32 s0, s0, s7
; GFX7-NEXT: v_add_i32_e32 v4, vcc, s0, v4
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
; GFX7-NEXT: v_readfirstlane_b32 s3, v2
; GFX7-NEXT: s_mov_b32 s0, s8
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s9, s1, s4
; GFX8-NEXT: s_mul_i32 s10, s0, s5
; GFX8-NEXT: s_add_u32 s9, s9, s10
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s9, v0
; GFX8-NEXT: s_and_b32 s10, s10, 1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s10, v1
; GFX8-NEXT: s_mul_i32 s9, s2, s4
; GFX8-NEXT: s_mul_i32 s10, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_add_u32 s9, s9, s10
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: s_mul_i32 s11, s0, s6
; GFX8-NEXT: s_and_b32 s10, s10, 1
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_add_u32 s9, s9, s11
; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX8-NEXT: s_cselect_b32 s11, 1, 0
; GFX8-NEXT: s_and_b32 s11, s11, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s9, v2
; GFX8-NEXT: s_add_i32 s10, s10, s11
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s10, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_mul_i32 s3, s3, s4
; GFX8-NEXT: s_mul_i32 s5, s2, s5
; GFX8-NEXT: v_mul_hi_u32 v4, v4, s4
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_mul_i32 s8, s0, s4
; GFX8-NEXT: s_mul_i32 s9, s1, s6
; GFX8-NEXT: s_mul_i32 s7, s0, s7
; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX8-NEXT: v_mul_hi_u32 v5, s0, v5
; GFX8-NEXT: s_add_i32 s0, s3, s5
; GFX8-NEXT: s_add_i32 s0, s0, s9
; GFX8-NEXT: s_add_i32 s0, s0, s7
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_readfirstlane_b32 s3, v2
; GFX8-NEXT: s_mov_b32 s0, s8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s9, s1, s4
; GFX9-NEXT: s_mul_i32 s10, s0, s5
; GFX9-NEXT: s_add_u32 s9, s9, s10
; GFX9-NEXT: s_cselect_b32 s10, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s11, s0, s4
; GFX9-NEXT: s_and_b32 s10, s10, 1
; GFX9-NEXT: s_add_u32 s9, s9, s11
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: s_add_i32 s10, s10, s11
; GFX9-NEXT: s_mul_i32 s11, s2, s4
; GFX9-NEXT: s_mul_i32 s12, s1, s5
; GFX9-NEXT: s_add_u32 s11, s11, s12
; GFX9-NEXT: s_cselect_b32 s12, 1, 0
; GFX9-NEXT: s_mul_i32 s13, s0, s6
; GFX9-NEXT: s_and_b32 s12, s12, 1
; GFX9-NEXT: s_add_u32 s11, s11, s13
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s11, s11, s14
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_mul_hi_u32 s15, s0, s5
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s11, s11, s15
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: s_add_i32 s12, s12, s11
; GFX9-NEXT: s_mul_i32 s3, s3, s4
; GFX9-NEXT: s_mul_i32 s11, s2, s5
; GFX9-NEXT: s_mul_i32 s13, s1, s6
; GFX9-NEXT: s_add_i32 s3, s3, s11
; GFX9-NEXT: s_mul_i32 s7, s0, s7
; GFX9-NEXT: s_add_i32 s3, s3, s13
; GFX9-NEXT: s_mul_hi_u32 s2, s2, s4
; GFX9-NEXT: s_add_i32 s3, s3, s7
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s5
; GFX9-NEXT: s_add_i32 s2, s3, s2
; GFX9-NEXT: s_mul_i32 s8, s0, s4
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s6
; GFX9-NEXT: s_add_i32 s1, s2, s1
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_add_i32 s3, s0, s12
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_mov_b32 s2, s10
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s8, s1, s4
; GFX10-NEXT: s_mul_i32 s9, s0, s5
; GFX10-NEXT: s_mul_hi_u32 s10, s0, s4
; GFX10-NEXT: s_add_u32 s8, s8, s9
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_mul_i32 s11, s1, s5
; GFX10-NEXT: s_and_b32 s9, s9, 1
; GFX10-NEXT: s_add_u32 s8, s8, s10
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_mul_i32 s12, s0, s6
; GFX10-NEXT: s_and_b32 s10, s10, 1
; GFX10-NEXT: s_mul_hi_u32 s13, s1, s4
; GFX10-NEXT: s_add_i32 s9, s9, s10
; GFX10-NEXT: s_mul_i32 s10, s2, s4
; GFX10-NEXT: s_mul_i32 s3, s3, s4
; GFX10-NEXT: s_add_u32 s10, s10, s11
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_mul_i32 s7, s0, s7
; GFX10-NEXT: s_and_b32 s11, s11, 1
; GFX10-NEXT: s_add_u32 s10, s10, s12
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_and_b32 s12, s12, 1
; GFX10-NEXT: s_add_i32 s11, s11, s12
; GFX10-NEXT: s_add_u32 s10, s10, s13
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s13, s0, s5
; GFX10-NEXT: s_and_b32 s12, s12, 1
; GFX10-NEXT: s_add_i32 s11, s11, s12
; GFX10-NEXT: s_add_u32 s10, s10, s13
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_mul_i32 s13, s1, s6
; GFX10-NEXT: s_and_b32 s12, s12, 1
; GFX10-NEXT: s_mul_hi_u32 s1, s1, s5
; GFX10-NEXT: s_add_i32 s11, s11, s12
; GFX10-NEXT: s_mul_i32 s12, s2, s5
; GFX10-NEXT: s_add_u32 s9, s10, s9
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_add_i32 s3, s3, s12
; GFX10-NEXT: s_mul_hi_u32 s2, s2, s4
; GFX10-NEXT: s_add_i32 s3, s3, s13
; GFX10-NEXT: s_and_b32 s10, s10, 1
; GFX10-NEXT: s_add_i32 s3, s3, s7
; GFX10-NEXT: s_add_i32 s11, s11, s10
; GFX10-NEXT: s_add_i32 s2, s3, s2
; GFX10-NEXT: s_mul_hi_u32 s3, s0, s6
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: s_mul_i32 s0, s0, s4
; GFX10-NEXT: s_add_i32 s1, s1, s3
; GFX10-NEXT: s_mov_b32 s2, s9
; GFX10-NEXT: s_add_i32 s3, s1, s11
; GFX10-NEXT: s_mov_b32 s1, s8
; GFX10-NEXT: ; return to shader part epilog
%result = mul i128 %num, %den
%cast = bitcast i128 %result to <4 x i32>
ret <4 x i32> %cast
}
; v_mul_i128: 128-bit integer multiply whose operands are plain (non-inreg)
; arguments of a default-calling-convention function. In the expected code
; below the operands are read from v0-v3 and v4-v7, and the product is left in
; v0-v3 before the s_setpc_b64 return.
;
; The expected expansion builds the result from 32-bit v_mul_lo_u32 /
; v_mul_hi_u32 partial products. For the lower result words each add that can
; overflow is followed by a v_cndmask_b32 that turns the carry bit into a 0/1
; value, and those values are summed into the next word. The top word (v3) is
; accumulated without consuming any carry-out. On the gfx9 and gfx10 targets
; some of the additions are expected to be merged into v_add3_u32.
;
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); regenerate them instead of editing by hand.
define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-LABEL: v_mul_i128:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX7-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX7-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX7-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX7-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX7-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX7-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX7-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX7-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v14
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v15
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; GFX7-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX7-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX7-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX7-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX7-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v12
; GFX7-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v13
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v0, v11
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX8-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX8-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX8-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX8-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v11
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX8-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX8-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX8-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v12
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v13
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v14
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v15
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v12, v11
; GFX8-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX8-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX8-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX8-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX8-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v12
; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v13
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v11
; GFX8-NEXT: v_mov_b32_e32 v0, v8
; GFX8-NEXT: v_mov_b32_e32 v1, v9
; GFX8-NEXT: v_mov_b32_e32 v2, v10
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX9-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX9-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v10
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v11
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v10, v10, v11
; GFX9-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX9-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX9-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v14
; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v15
; GFX9-NEXT: v_add3_u32 v12, v12, v13, v14
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v11, v12, v13, v11
; GFX9-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX9-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX9-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX9-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX9-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX9-NEXT: v_add_u32_e32 v3, v3, v12
; GFX9-NEXT: v_add3_u32 v3, v3, v13, v7
; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX9-NEXT: v_add3_u32 v3, v1, v0, v11
; GFX9-NEXT: v_mov_b32_e32 v0, v8
; GFX9-NEXT: v_mov_b32_e32 v1, v9
; GFX9-NEXT: v_mov_b32_e32 v2, v10
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v8, v2, v4
; GFX10-NEXT: v_mul_lo_u32 v9, v1, v5
; GFX10-NEXT: v_mul_lo_u32 v10, v1, v4
; GFX10-NEXT: v_mul_lo_u32 v11, v0, v5
; GFX10-NEXT: v_mul_hi_u32 v12, v0, v4
; GFX10-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX10-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX10-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX10-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX10-NEXT: v_add_co_u32 v8, s4, v8, v9
; GFX10-NEXT: v_add_co_u32 v9, s5, v10, v11
; GFX10-NEXT: v_mul_hi_u32 v11, v1, v4
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v13, s4, v8, v13
; GFX10-NEXT: v_add_co_u32 v8, s5, v9, v12
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9
; GFX10-NEXT: v_mul_lo_u32 v10, v2, v5
; GFX10-NEXT: v_add_co_u32 v11, s4, v11, v15
; GFX10-NEXT: v_add3_u32 v12, v14, v12, v13
; GFX10-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX10-NEXT: v_mul_hi_u32 v15, v2, v4
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v10
; GFX10-NEXT: v_add_co_u32 v2, s4, v11, v9
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v6, v0, v6
; GFX10-NEXT: v_add3_u32 v3, v3, v13, v7
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
; GFX10-NEXT: v_add3_u32 v4, v12, v14, v5
; GFX10-NEXT: v_add3_u32 v1, v3, v15, v1
; GFX10-NEXT: v_add3_u32 v3, v1, v6, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
  ; The whole test input is this one i128 multiply; everything checked above
  ; is the code llc is expected to emit for it on each target.
  %result = mul i128 %num, %den
  ret i128 %result
}
define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-LABEL: s_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s16, s0
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX7-NEXT: s_mul_i32 s17, s1, s8
; GFX7-NEXT: s_mul_i32 s18, s16, s9
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s17, v0
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s18, v1
; GFX7-NEXT: s_mul_i32 s17, s2, s8
; GFX7-NEXT: s_mul_i32 s18, s1, s9
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s8
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: s_mul_i32 s19, s16, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_mov_b32_e32 v3, s9
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_mul_hi_u32 v4, s16, v3
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s17, v2
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s18, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX7-NEXT: s_mul_i32 s17, s3, s8
; GFX7-NEXT: s_mul_i32 s18, s2, s9
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: s_mul_i32 s19, s1, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v5, v4, s8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_mul_i32 s20, s16, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s17, v5
; GFX7-NEXT: v_mov_b32_e32 v6, s10
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_mul_hi_u32 v7, s16, v6
; GFX7-NEXT: v_add_i32_e32 v8, vcc, s18, v8
; GFX7-NEXT: s_mul_i32 s17, s4, s8
; GFX7-NEXT: s_mul_i32 s18, s3, s9
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GFX7-NEXT: s_mul_i32 s19, s2, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: s_mul_i32 s20, s1, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v7, v5, s8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_mul_i32 s21, s16, s12
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s21
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v4, v4, s9
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v7, vcc, s17, v7
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v11, vcc, s18, v11
; GFX7-NEXT: s_mul_i32 s17, s5, s8
; GFX7-NEXT: s_mul_i32 s18, s4, s9
; GFX7-NEXT: v_mul_hi_u32 v8, s1, v6
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_mov_b32_e32 v9, s11
; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX7-NEXT: s_mul_i32 s19, s3, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_mul_hi_u32 v10, s16, v9
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT: s_mul_i32 s20, s2, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT: s_mul_i32 s21, s1, s12
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s21
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4
; GFX7-NEXT: v_mov_b32_e32 v7, s4
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v8, v7, s8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_mul_i32 s22, s16, s13
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s22
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v8, vcc, s17, v8
; GFX7-NEXT: v_mul_hi_u32 v10, v5, s9
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v14, vcc, s18, v14
; GFX7-NEXT: s_mul_i32 s17, s6, s8
; GFX7-NEXT: s_mul_i32 s18, s5, s9
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_mul_hi_u32 v6, s2, v6
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT: s_mul_i32 s19, s4, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_mul_hi_u32 v11, s1, v9
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v14, v10
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mov_b32_e32 v12, s12
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: s_mul_i32 s20, s3, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_mul_hi_u32 v13, s16, v12
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v11
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT: s_mul_i32 s21, s2, s12
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v13
; GFX7-NEXT: s_add_u32 s17, s17, s21
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v6, v4
; GFX7-NEXT: s_mul_i32 s22, s1, s13
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s22
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; GFX7-NEXT: v_mov_b32_e32 v8, s5
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v10, v8, s8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_mul_i32 s23, s16, s14
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s23
; GFX7-NEXT: v_mul_hi_u32 v11, v7, s9
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v10, vcc, s17, v10
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX7-NEXT: v_mul_hi_u32 v5, v5, s10
; GFX7-NEXT: v_add_i32_e32 v17, vcc, s18, v17
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_mul_hi_u32 v13, s2, v9
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v17, v11
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: v_mul_hi_u32 v14, s1, v12
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v13
; GFX7-NEXT: v_mov_b32_e32 v15, s13
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_mul_hi_u32 v16, s16, v15
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v14
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v16
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: s_mul_i32 s7, s7, s8
; GFX7-NEXT: s_mul_i32 s17, s6, s9
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GFX7-NEXT: s_mul_i32 s5, s5, s10
; GFX7-NEXT: s_mul_i32 s4, s4, s11
; GFX7-NEXT: s_mul_i32 s11, s3, s12
; GFX7-NEXT: s_mul_i32 s12, s2, s13
; GFX7-NEXT: s_mul_i32 s13, s1, s14
; GFX7-NEXT: v_mul_hi_u32 v11, s2, v12
; GFX7-NEXT: v_mul_hi_u32 v12, s1, v15
; GFX7-NEXT: s_add_i32 s1, s7, s17
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX7-NEXT: s_add_i32 s1, s1, s5
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v10, v6
; GFX7-NEXT: v_mov_b32_e32 v10, s6
; GFX7-NEXT: s_add_i32 s1, s1, s4
; GFX7-NEXT: v_mul_hi_u32 v10, v10, s8
; GFX7-NEXT: s_add_i32 s1, s1, s11
; GFX7-NEXT: v_mul_hi_u32 v8, v8, s9
; GFX7-NEXT: s_add_i32 s1, s1, s12
; GFX7-NEXT: s_mul_i32 s15, s16, s15
; GFX7-NEXT: v_mul_hi_u32 v7, v7, s10
; GFX7-NEXT: s_add_i32 s1, s1, s13
; GFX7-NEXT: v_mul_hi_u32 v9, s3, v9
; GFX7-NEXT: s_add_i32 s1, s1, s15
; GFX7-NEXT: v_add_i32_e32 v10, vcc, s1, v10
; GFX7-NEXT: v_mov_b32_e32 v13, s14
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GFX7-NEXT: v_mul_hi_u32 v13, s16, v13
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v8, v7
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v11
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v12
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v13
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; GFX7-NEXT: s_mul_i32 s0, s0, s8
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
; GFX7-NEXT: v_readfirstlane_b32 s3, v2
; GFX7-NEXT: v_readfirstlane_b32 s4, v3
; GFX7-NEXT: v_readfirstlane_b32 s5, v4
; GFX7-NEXT: v_readfirstlane_b32 s6, v5
; GFX7-NEXT: v_readfirstlane_b32 s7, v6
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s16, s0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
; GFX8-NEXT: s_mul_i32 s17, s1, s8
; GFX8-NEXT: s_mul_i32 s18, s16, s9
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s17, v0
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s18, v1
; GFX8-NEXT: s_mul_i32 s17, s2, s8
; GFX8-NEXT: s_mul_i32 s18, s1, s9
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s8
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: s_mul_i32 s19, s16, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_mul_hi_u32 v4, s16, v3
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s17, v2
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s18, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: s_mul_i32 s17, s3, s8
; GFX8-NEXT: s_mul_i32 s18, s2, s9
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: s_mul_i32 s19, s1, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v5, v4, s8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_mul_i32 s20, s16, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s17, v5
; GFX8-NEXT: v_mov_b32_e32 v6, s10
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_mul_hi_u32 v7, s16, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s18, v8
; GFX8-NEXT: s_mul_i32 s17, s4, s8
; GFX8-NEXT: s_mul_i32 s18, s3, s9
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v5
; GFX8-NEXT: s_mul_i32 s19, s2, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: s_mul_i32 s20, s1, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v7, v5, s8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_mul_i32 s21, s16, s12
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s21
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v4, v4, s9
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s17, v7
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s18, v11
; GFX8-NEXT: s_mul_i32 s17, s5, s8
; GFX8-NEXT: s_mul_i32 s18, s4, s9
; GFX8-NEXT: v_mul_hi_u32 v8, s1, v6
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: s_mul_i32 s19, s3, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_mul_hi_u32 v10, s16, v9
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v11, v7
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: s_mul_i32 s20, s2, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: s_mul_i32 s21, s1, s12
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s21
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
; GFX8-NEXT: v_mov_b32_e32 v7, s4
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v8, v7, s8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_mul_i32 s22, s16, s13
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s22
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s17, v8
; GFX8-NEXT: v_mul_hi_u32 v10, v5, s9
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s18, v14
; GFX8-NEXT: s_mul_i32 s17, s6, s8
; GFX8-NEXT: s_mul_i32 s18, s5, s9
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_mul_hi_u32 v6, s2, v6
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT: s_mul_i32 s19, s4, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_mul_hi_u32 v11, s1, v9
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mov_b32_e32 v12, s12
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: s_mul_i32 s20, s3, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_mul_hi_u32 v13, s16, v12
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT: s_mul_i32 s21, s2, s12
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13
; GFX8-NEXT: s_add_u32 s17, s17, s21
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
; GFX8-NEXT: s_mul_i32 s22, s1, s13
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s22
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
; GFX8-NEXT: v_mov_b32_e32 v8, s5
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v10, v8, s8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_mul_i32 s23, s16, s14
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s23
; GFX8-NEXT: v_mul_hi_u32 v11, v7, s9
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s17, v10
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX8-NEXT: v_mul_hi_u32 v5, v5, s10
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s18, v17
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_mul_hi_u32 v13, s2, v9
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v17, v11
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: v_mul_hi_u32 v14, s1, v12
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13
; GFX8-NEXT: v_mov_b32_e32 v15, s13
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_mul_hi_u32 v16, s16, v15
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v14
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: s_mul_i32 s7, s7, s8
; GFX8-NEXT: s_mul_i32 s17, s6, s9
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6
; GFX8-NEXT: s_mul_i32 s5, s5, s10
; GFX8-NEXT: s_mul_i32 s4, s4, s11
; GFX8-NEXT: s_mul_i32 s11, s3, s12
; GFX8-NEXT: s_mul_i32 s12, s2, s13
; GFX8-NEXT: s_mul_i32 s13, s1, s14
; GFX8-NEXT: v_mul_hi_u32 v11, s2, v12
; GFX8-NEXT: v_mul_hi_u32 v12, s1, v15
; GFX8-NEXT: s_add_i32 s1, s7, s17
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT: s_add_i32 s1, s1, s5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v10, v6
; GFX8-NEXT: v_mov_b32_e32 v10, s6
; GFX8-NEXT: s_add_i32 s1, s1, s4
; GFX8-NEXT: v_mul_hi_u32 v10, v10, s8
; GFX8-NEXT: s_add_i32 s1, s1, s11
; GFX8-NEXT: v_mul_hi_u32 v8, v8, s9
; GFX8-NEXT: s_add_i32 s1, s1, s12
; GFX8-NEXT: s_mul_i32 s15, s16, s15
; GFX8-NEXT: v_mul_hi_u32 v7, v7, s10
; GFX8-NEXT: s_add_i32 s1, s1, s13
; GFX8-NEXT: v_mul_hi_u32 v9, s3, v9
; GFX8-NEXT: s_add_i32 s1, s1, s15
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s1, v10
; GFX8-NEXT: v_mov_b32_e32 v13, s14
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8
; GFX8-NEXT: v_mul_hi_u32 v13, s16, v13
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v12
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v13
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
; GFX8-NEXT: s_mul_i32 s0, s0, s8
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_readfirstlane_b32 s3, v2
; GFX8-NEXT: v_readfirstlane_b32 s4, v3
; GFX8-NEXT: v_readfirstlane_b32 s5, v4
; GFX8-NEXT: v_readfirstlane_b32 s6, v5
; GFX8-NEXT: v_readfirstlane_b32 s7, v6
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s16, s0
; GFX9-NEXT: s_mul_i32 s17, s1, s8
; GFX9-NEXT: s_mul_i32 s18, s16, s9
; GFX9-NEXT: s_add_u32 s17, s17, s18
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s19, s16, s8
; GFX9-NEXT: s_and_b32 s18, s18, 1
; GFX9-NEXT: s_add_u32 s17, s17, s19
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
; GFX9-NEXT: s_and_b32 s19, s19, 1
; GFX9-NEXT: s_add_i32 s18, s18, s19
; GFX9-NEXT: s_mul_i32 s19, s2, s8
; GFX9-NEXT: s_mul_i32 s20, s1, s9
; GFX9-NEXT: s_add_u32 s19, s19, s20
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
; GFX9-NEXT: s_mul_i32 s21, s16, s10
; GFX9-NEXT: s_and_b32 s20, s20, 1
; GFX9-NEXT: s_add_u32 s19, s19, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_mul_hi_u32 s22, s1, s8
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s19, s19, s22
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s19, s19, s23
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s18, s19, s18
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
; GFX9-NEXT: s_and_b32 s19, s19, 1
; GFX9-NEXT: s_add_i32 s20, s20, s19
; GFX9-NEXT: s_mul_i32 s19, s3, s8
; GFX9-NEXT: s_mul_i32 s21, s2, s9
; GFX9-NEXT: s_add_u32 s19, s19, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_mul_i32 s22, s1, s10
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_u32 s19, s19, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_i32 s23, s16, s11
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s23
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s24, s2, s8
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s24
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s25, s1, s9
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s25
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s26, s16, s10
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s26
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s20
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
; GFX9-NEXT: s_and_b32 s20, s20, 1
; GFX9-NEXT: s_add_i32 s21, s21, s20
; GFX9-NEXT: s_mul_i32 s20, s4, s8
; GFX9-NEXT: s_mul_i32 s22, s3, s9
; GFX9-NEXT: s_add_u32 s20, s20, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_mul_i32 s23, s2, s10
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_u32 s20, s20, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_i32 s24, s1, s11
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s24
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_i32 s25, s16, s12
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s25
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s26, s3, s8
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s26
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s27, s2, s9
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s27
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s28, s1, s10
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s28
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s29, s16, s11
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s29
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_i32 s22, s22, s21
; GFX9-NEXT: s_mul_i32 s21, s5, s8
; GFX9-NEXT: s_mul_i32 s23, s4, s9
; GFX9-NEXT: s_add_u32 s21, s21, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_mul_i32 s24, s3, s10
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_u32 s21, s21, s24
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s25, s2, s11
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s25
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s26, s1, s12
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s26
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s27, s16, s13
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s27
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s28, s4, s8
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s28
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s29
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s30, s2, s10
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s30
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s31, s1, s11
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s31
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s33, s16, s12
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s33
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_i32 s23, s23, s22
; GFX9-NEXT: s_mul_i32 s22, s6, s8
; GFX9-NEXT: s_mul_i32 s24, s5, s9
; GFX9-NEXT: s_add_u32 s22, s22, s24
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_mul_i32 s25, s4, s10
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_add_u32 s22, s22, s25
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s26, s3, s11
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s26
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s27, s2, s12
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s27
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s28, s1, s13
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s28
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s29, s16, s14
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s29
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s30, s5, s8
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s30
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s31, s4, s9
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s31
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s33, s3, s10
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s33
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s34, s2, s11
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s34
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s35, s1, s12
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s35
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s36, s16, s13
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s36
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_i32 s24, s24, s23
; GFX9-NEXT: s_mul_i32 s7, s7, s8
; GFX9-NEXT: s_mul_i32 s23, s6, s9
; GFX9-NEXT: s_mul_i32 s25, s5, s10
; GFX9-NEXT: s_add_i32 s7, s7, s23
; GFX9-NEXT: s_mul_i32 s26, s4, s11
; GFX9-NEXT: s_add_i32 s7, s7, s25
; GFX9-NEXT: s_mul_i32 s27, s3, s12
; GFX9-NEXT: s_add_i32 s7, s7, s26
; GFX9-NEXT: s_mul_i32 s28, s2, s13
; GFX9-NEXT: s_add_i32 s7, s7, s27
; GFX9-NEXT: s_mul_i32 s29, s1, s14
; GFX9-NEXT: s_add_i32 s7, s7, s28
; GFX9-NEXT: s_mul_i32 s15, s16, s15
; GFX9-NEXT: s_add_i32 s7, s7, s29
; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8
; GFX9-NEXT: s_add_i32 s7, s7, s15
; GFX9-NEXT: s_mul_hi_u32 s5, s5, s9
; GFX9-NEXT: s_add_i32 s6, s7, s6
; GFX9-NEXT: s_mul_hi_u32 s4, s4, s10
; GFX9-NEXT: s_add_i32 s5, s6, s5
; GFX9-NEXT: s_mul_hi_u32 s3, s3, s11
; GFX9-NEXT: s_add_i32 s4, s5, s4
; GFX9-NEXT: s_mul_hi_u32 s2, s2, s12
; GFX9-NEXT: s_add_i32 s3, s4, s3
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s13
; GFX9-NEXT: s_add_i32 s2, s3, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s8
; GFX9-NEXT: s_mul_hi_u32 s8, s16, s14
; GFX9-NEXT: s_add_i32 s1, s2, s1
; GFX9-NEXT: s_add_i32 s1, s1, s8
; GFX9-NEXT: s_add_i32 s7, s1, s24
; GFX9-NEXT: s_mov_b32 s1, s17
; GFX9-NEXT: s_mov_b32 s2, s18
; GFX9-NEXT: s_mov_b32 s3, s19
; GFX9-NEXT: s_mov_b32 s4, s20
; GFX9-NEXT: s_mov_b32 s5, s21
; GFX9-NEXT: s_mov_b32 s6, s22
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i256:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s16, s1, s8
; GFX10-NEXT: s_mul_i32 s17, s0, s9
; GFX10-NEXT: s_mul_hi_u32 s18, s0, s8
; GFX10-NEXT: s_add_u32 s16, s16, s17
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_mul_i32 s19, s1, s9
; GFX10-NEXT: s_and_b32 s17, s17, 1
; GFX10-NEXT: s_add_u32 s16, s16, s18
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
; GFX10-NEXT: s_mul_i32 s20, s0, s10
; GFX10-NEXT: s_and_b32 s18, s18, 1
; GFX10-NEXT: s_mul_hi_u32 s21, s1, s8
; GFX10-NEXT: s_add_i32 s17, s17, s18
; GFX10-NEXT: s_mul_i32 s18, s2, s8
; GFX10-NEXT: s_mul_i32 s22, s0, s11
; GFX10-NEXT: s_add_u32 s18, s18, s19
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
; GFX10-NEXT: s_mul_i32 s23, s1, s11
; GFX10-NEXT: s_and_b32 s19, s19, 1
; GFX10-NEXT: s_add_u32 s18, s18, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s24, s0, s12
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_mul_i32 s25, s4, s9
; GFX10-NEXT: s_add_i32 s19, s19, s20
; GFX10-NEXT: s_add_u32 s18, s18, s21
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s21, s0, s9
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_mul_i32 s26, s2, s11
; GFX10-NEXT: s_add_i32 s19, s19, s20
; GFX10-NEXT: s_add_u32 s18, s18, s21
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s21, s1, s10
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_mul_i32 s27, s0, s13
; GFX10-NEXT: s_add_i32 s19, s19, s20
; GFX10-NEXT: s_add_u32 s17, s18, s17
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
; GFX10-NEXT: s_mul_i32 s20, s2, s9
; GFX10-NEXT: s_and_b32 s18, s18, 1
; GFX10-NEXT: s_mul_hi_u32 s28, s3, s9
; GFX10-NEXT: s_add_i32 s19, s19, s18
; GFX10-NEXT: s_mul_i32 s18, s3, s8
; GFX10-NEXT: s_mul_i32 s7, s7, s8
; GFX10-NEXT: s_add_u32 s18, s18, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s15, s0, s15
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_add_u32 s18, s18, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s22, s2, s8
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s22, s1, s9
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s22, s0, s10
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_i32 s22, s2, s10
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s19
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
; GFX10-NEXT: s_mul_i32 s21, s3, s9
; GFX10-NEXT: s_and_b32 s19, s19, 1
; GFX10-NEXT: s_add_i32 s20, s20, s19
; GFX10-NEXT: s_mul_i32 s19, s4, s8
; GFX10-NEXT: s_add_u32 s19, s19, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_u32 s19, s19, s22
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s23
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s23, s3, s8
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s24
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s24, s2, s9
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s23
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s23, s1, s10
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s24
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s24, s0, s11
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s23
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_i32 s23, s5, s8
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s24
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_i32 s24, s3, s10
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s22, s1, s12
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_add_i32 s21, s21, s20
; GFX10-NEXT: s_add_u32 s23, s23, s25
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_u32 s23, s23, s24
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s24, s25, s24
; GFX10-NEXT: s_add_u32 s23, s23, s26
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s26, s2, s10
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_i32 s24, s24, s25
; GFX10-NEXT: s_add_u32 s22, s23, s22
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s25, s1, s11
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s23, s24, s23
; GFX10-NEXT: s_add_u32 s22, s22, s27
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s27, s0, s12
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s20, s22, s20
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_i32 s24, s6, s8
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s22, s23, s22
; GFX10-NEXT: s_add_u32 s20, s20, s28
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s28, s5, s9
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s26
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s26, s4, s10
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s25
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s25, s3, s11
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s27
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s27, s2, s12
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_i32 s23, s1, s13
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s22, s22, s21
; GFX10-NEXT: s_add_u32 s21, s24, s28
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s28, s0, s14
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_u32 s21, s21, s26
; GFX10-NEXT: s_cselect_b32 s26, 1, 0
; GFX10-NEXT: s_and_b32 s26, s26, 1
; GFX10-NEXT: s_add_i32 s24, s24, s26
; GFX10-NEXT: s_add_u32 s21, s21, s25
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s26, s5, s8
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_i32 s24, s24, s25
; GFX10-NEXT: s_add_u32 s21, s21, s27
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s27, s4, s9
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_i32 s24, s24, s25
; GFX10-NEXT: s_add_u32 s21, s21, s23
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s25, s3, s10
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s23, s24, s23
; GFX10-NEXT: s_add_u32 s21, s21, s28
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s28, s2, s11
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s26
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s26, s1, s12
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s27
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s27, s0, s13
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s25
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s25, s6, s9
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_mul_hi_u32 s6, s6, s8
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s28
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s26
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s26, s5, s10
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_mul_hi_u32 s5, s5, s9
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s27
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s27, s4, s11
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_mul_hi_u32 s4, s4, s10
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s22
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_add_i32 s7, s7, s25
; GFX10-NEXT: s_mul_i32 s24, s3, s12
; GFX10-NEXT: s_add_i32 s7, s7, s26
; GFX10-NEXT: s_mul_i32 s25, s2, s13
; GFX10-NEXT: s_add_i32 s7, s7, s27
; GFX10-NEXT: s_mul_i32 s26, s1, s14
; GFX10-NEXT: s_add_i32 s7, s7, s24
; GFX10-NEXT: s_mul_hi_u32 s3, s3, s11
; GFX10-NEXT: s_add_i32 s7, s7, s25
; GFX10-NEXT: s_mul_hi_u32 s2, s2, s12
; GFX10-NEXT: s_add_i32 s7, s7, s26
; GFX10-NEXT: s_mul_hi_u32 s1, s1, s13
; GFX10-NEXT: s_add_i32 s7, s7, s15
; GFX10-NEXT: s_add_i32 s6, s7, s6
; GFX10-NEXT: s_add_i32 s5, s6, s5
; GFX10-NEXT: s_mov_b32 s6, s21
; GFX10-NEXT: s_add_i32 s4, s5, s4
; GFX10-NEXT: s_mov_b32 s5, s20
; GFX10-NEXT: s_add_i32 s3, s4, s3
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s14
; GFX10-NEXT: s_add_i32 s2, s3, s2
; GFX10-NEXT: s_and_b32 s3, s22, 1
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: s_add_i32 s23, s23, s3
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: s_mul_i32 s0, s0, s8
; GFX10-NEXT: s_add_i32 s7, s1, s23
; GFX10-NEXT: s_mov_b32 s1, s16
; GFX10-NEXT: s_mov_b32 s2, s17
; GFX10-NEXT: s_mov_b32 s3, s18
; GFX10-NEXT: s_mov_b32 s4, s19
; GFX10-NEXT: ; return to shader part epilog
%result = mul i256 %num, %den
%cast = bitcast i256 %result to <8 x i32>
ret <8 x i32> %cast
}
define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7-LABEL: v_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX7-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX7-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX7-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX7-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX7-NEXT: v_add_i32_e32 v16, vcc, v16, v17
; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v16, vcc, v16, v18
; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v17, vcc, v17, v18
; GFX7-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v18, v21
; GFX7-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v18, v21
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_add_i32_e32 v17, vcc, v18, v17
; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v20, v3, v8
; GFX7-NEXT: v_mul_lo_u32 v21, v2, v9
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v2, v8
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v1, v9
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v0, v10
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v21, v4, v8
; GFX7-NEXT: v_mul_lo_u32 v22, v3, v9
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v0, v12
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v2, v9
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v0, v11
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX7-NEXT: v_mul_lo_u32 v23, v4, v9
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_mul_lo_u32 v21, v3, v10
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v23, v22
; GFX7-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX7-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX7-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX7-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v24, v23
; GFX7-NEXT: v_mul_lo_u32 v24, v3, v11
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v22
; GFX7-NEXT: v_mul_lo_u32 v22, v0, v8
; GFX7-NEXT: v_mul_hi_u32 v8, v6, v8
; GFX7-NEXT: v_mul_lo_u32 v6, v6, v9
; GFX7-NEXT: v_mul_hi_u32 v9, v5, v9
; GFX7-NEXT: v_mul_lo_u32 v5, v5, v10
; GFX7-NEXT: v_mul_hi_u32 v10, v4, v10
; GFX7-NEXT: v_mul_lo_u32 v4, v4, v11
; GFX7-NEXT: v_mul_hi_u32 v11, v3, v11
; GFX7-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX7-NEXT: v_mul_hi_u32 v12, v2, v12
; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; GFX7-NEXT: v_mul_hi_u32 v13, v1, v13
; GFX7-NEXT: v_mul_lo_u32 v1, v1, v14
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v15
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v8
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v9
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v10
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v12
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v13
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v0, v23
; GFX7-NEXT: v_mov_b32_e32 v0, v22
; GFX7-NEXT: v_mov_b32_e32 v1, v16
; GFX7-NEXT: v_mov_b32_e32 v2, v17
; GFX7-NEXT: v_mov_b32_e32 v3, v18
; GFX7-NEXT: v_mov_b32_e32 v4, v19
; GFX7-NEXT: v_mov_b32_e32 v5, v20
; GFX7-NEXT: v_mov_b32_e32 v6, v21
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX8-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX8-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX8-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX8-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v17
; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v18
; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v18
; GFX8-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v21
; GFX8-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v21
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v18, v17
; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v20, v3, v8
; GFX8-NEXT: v_mul_lo_u32 v21, v2, v9
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v2, v8
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v1, v9
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v0, v10
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v21, v4, v8
; GFX8-NEXT: v_mul_lo_u32 v22, v3, v9
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v0, v12
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v2, v9
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v0, v11
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX8-NEXT: v_mul_lo_u32 v23, v4, v9
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_mul_lo_u32 v21, v3, v10
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v23, v22
; GFX8-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX8-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX8-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX8-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v24, v23
; GFX8-NEXT: v_mul_lo_u32 v24, v3, v11
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v22
; GFX8-NEXT: v_mul_lo_u32 v22, v0, v8
; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8
; GFX8-NEXT: v_mul_lo_u32 v6, v6, v9
; GFX8-NEXT: v_mul_hi_u32 v9, v5, v9
; GFX8-NEXT: v_mul_lo_u32 v5, v5, v10
; GFX8-NEXT: v_mul_hi_u32 v10, v4, v10
; GFX8-NEXT: v_mul_lo_u32 v4, v4, v11
; GFX8-NEXT: v_mul_hi_u32 v11, v3, v11
; GFX8-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX8-NEXT: v_mul_hi_u32 v12, v2, v12
; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
; GFX8-NEXT: v_mul_hi_u32 v13, v1, v13
; GFX8-NEXT: v_mul_lo_u32 v1, v1, v14
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v15
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v8
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v9
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v10
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v11
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v12
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v13
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v23
; GFX8-NEXT: v_mov_b32_e32 v0, v22
; GFX8-NEXT: v_mov_b32_e32 v1, v16
; GFX8-NEXT: v_mov_b32_e32 v2, v17
; GFX8-NEXT: v_mov_b32_e32 v3, v18
; GFX8-NEXT: v_mov_b32_e32 v4, v19
; GFX8-NEXT: v_mov_b32_e32 v5, v20
; GFX8-NEXT: v_mov_b32_e32 v6, v21
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v16, v2, v8
; GFX9-NEXT: v_mul_lo_u32 v17, v1, v9
; GFX9-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX9-NEXT: v_mul_hi_u32 v19, v1, v8
; GFX9-NEXT: v_mul_lo_u32 v20, v1, v8
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v17
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v18
; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v16, v19
; GFX9-NEXT: v_mul_lo_u32 v21, v0, v9
; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v18, v17, v18, v16
; GFX9-NEXT: v_mul_hi_u32 v16, v0, v8
; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, v20, v21
; GFX9-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v17, v16
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v17, v20, v17
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21
; GFX9-NEXT: v_mul_lo_u32 v21, v3, v8
; GFX9-NEXT: v_mul_lo_u32 v22, v2, v9
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, v19, v17
; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v20, v19
; GFX9-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v22
; GFX9-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v22
; GFX9-NEXT: v_mul_hi_u32 v23, v2, v8
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v21, v20, v22
; GFX9-NEXT: v_mul_hi_u32 v21, v1, v9
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v0, v10
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v22, v21
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23
; GFX9-NEXT: v_mul_lo_u32 v22, v4, v8
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v9
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, v19, v18
; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v19, v20, v21, v19
; GFX9-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v22, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v22, v0, v12
; GFX9-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX9-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v22, v23
; GFX9-NEXT: v_mul_hi_u32 v22, v2, v9
; GFX9-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v22, v23
; GFX9-NEXT: v_mul_hi_u32 v22, v0, v11
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v21, v22, v20
; GFX9-NEXT: v_mul_lo_u32 v21, v5, v8
; GFX9-NEXT: v_mul_lo_u32 v22, v4, v9
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v22, v23, v21
; GFX9-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX9-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v11
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v24, v25, v23
; GFX9-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v22, v21
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v23, v24, v22
; GFX9-NEXT: v_mul_lo_u32 v23, v6, v9
; GFX9-NEXT: v_mul_lo_u32 v24, v4, v11
; GFX9-NEXT: v_mul_hi_u32 v4, v4, v10
; GFX9-NEXT: v_mul_hi_u32 v6, v6, v8
; GFX9-NEXT: v_add_u32_e32 v7, v7, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10
; GFX9-NEXT: v_mul_hi_u32 v5, v5, v9
; GFX9-NEXT: v_mul_hi_u32 v9, v3, v11
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX9-NEXT: v_mul_hi_u32 v10, v2, v12
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX9-NEXT: v_mul_hi_u32 v11, v1, v13
; GFX9-NEXT: v_mul_lo_u32 v12, v1, v14
; GFX9-NEXT: v_mul_lo_u32 v13, v0, v15
; GFX9-NEXT: v_add3_u32 v7, v7, v23, v24
; GFX9-NEXT: v_add3_u32 v2, v7, v3, v2
; GFX9-NEXT: v_mul_lo_u32 v1, v0, v8
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX9-NEXT: v_add3_u32 v2, v2, v12, v13
; GFX9-NEXT: v_add3_u32 v2, v2, v6, v5
; GFX9-NEXT: v_add3_u32 v2, v2, v4, v9
; GFX9-NEXT: v_add3_u32 v2, v2, v10, v11
; GFX9-NEXT: v_add3_u32 v7, v2, v0, v22
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v16
; GFX9-NEXT: v_mov_b32_e32 v2, v17
; GFX9-NEXT: v_mov_b32_e32 v3, v18
; GFX9-NEXT: v_mov_b32_e32 v4, v19
; GFX9-NEXT: v_mov_b32_e32 v5, v20
; GFX9-NEXT: v_mov_b32_e32 v6, v21
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX10-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX10-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX10-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX10-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX10-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX10-NEXT: v_mul_lo_u32 v22, v3, v8
; GFX10-NEXT: v_mul_lo_u32 v25, v1, v10
; GFX10-NEXT: v_mul_hi_u32 v23, v0, v9
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v17
; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v27, v0, v10
; GFX10-NEXT: v_mul_hi_u32 v29, v3, v9
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v18
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v19, s4, v19, v20
; GFX10-NEXT: v_mul_lo_u32 v20, v2, v9
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v17, v17, v18
; GFX10-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9
; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX10-NEXT: v_add_co_u32 v18, s4, v19, v18
; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v22, v20
; GFX10-NEXT: v_add_co_u32 v18, s5, v18, v21
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
; GFX10-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v25
; GFX10-NEXT: v_add3_u32 v19, v24, v19, v21
; GFX10-NEXT: v_mul_hi_u32 v21, v2, v8
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v18, s5, v18, v23
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v22
; GFX10-NEXT: v_mul_hi_u32 v23, v1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v21
; GFX10-NEXT: v_add_co_u32 v17, s5, v18, v17
; GFX10-NEXT: v_add3_u32 v21, v26, v24, v25
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v23
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v8
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v9
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
; GFX10-NEXT: v_add3_u32 v18, v19, v22, v18
; GFX10-NEXT: v_add3_u32 v19, v21, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v21, v2, v10
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v11
; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v22, s4, v25, v26
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v26, v3, v8
; GFX10-NEXT: v_add_co_u32 v21, s4, v22, v21
; GFX10-NEXT: v_mul_lo_u32 v22, v0, v12
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v24
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v18, s4, v20, v18
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v9
; GFX10-NEXT: v_add3_u32 v24, v25, v27, v24
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v26
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v19, v19, v23, v20
; GFX10-NEXT: v_mul_hi_u32 v20, v1, v10
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_add3_u32 v23, v24, v25, v26
; GFX10-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v9
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v10
; GFX10-NEXT: v_add_co_u32 v20, s4, v21, v20
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v27, v0, v11
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v24
; GFX10-NEXT: v_add3_u32 v21, v23, v25, v21
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26
; GFX10-NEXT: v_mul_lo_u32 v26, v1, v12
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23
; GFX10-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26
; GFX10-NEXT: v_mul_hi_u32 v26, v4, v8
; GFX10-NEXT: v_add_co_u32 v19, s5, v20, v19
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23
; GFX10-NEXT: v_add3_u32 v23, v24, v27, v28
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v20, v21, v25, v20
; GFX10-NEXT: v_add_co_u32 v21, s4, v22, v26
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v10
; GFX10-NEXT: v_add3_u32 v23, v23, v30, v24
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v29
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11
; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8
; GFX10-NEXT: v_mul_lo_u32 v28, v5, v9
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_add3_u32 v23, v23, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v10
; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v26
; GFX10-NEXT: v_add_co_u32 v25, s4, v27, v28
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
; GFX10-NEXT: v_mul_lo_u32 v27, v3, v11
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v25, v24
; GFX10-NEXT: v_mul_hi_u32 v29, v0, v12
; GFX10-NEXT: v_add3_u32 v22, v23, v22, v26
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v12
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v24, v27
; GFX10-NEXT: v_mul_lo_u32 v27, v1, v13
; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v29
; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
; GFX10-NEXT: v_mul_hi_u32 v27, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v24
; GFX10-NEXT: v_add3_u32 v24, v28, v25, v29
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v25, v3, v10
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
; GFX10-NEXT: v_add3_u32 v24, v24, v30, v32
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v31
; GFX10-NEXT: v_add3_u32 v21, v22, v26, v21
; GFX10-NEXT: v_mul_hi_u32 v26, v2, v11
; GFX10-NEXT: v_add3_u32 v22, v24, v28, v27
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v25
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v28, v6, v9
; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12
; GFX10-NEXT: v_mul_hi_u32 v27, v1, v12
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v26
; GFX10-NEXT: v_add3_u32 v22, v22, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v11
; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v28
; GFX10-NEXT: v_mul_lo_u32 v28, v2, v13
; GFX10-NEXT: v_mul_hi_u32 v6, v6, v8
; GFX10-NEXT: v_mul_hi_u32 v5, v5, v9
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
; GFX10-NEXT: v_add3_u32 v7, v7, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v14
; GFX10-NEXT: v_mul_hi_u32 v25, v0, v13
; GFX10-NEXT: v_mul_hi_u32 v4, v4, v10
; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11
; GFX10-NEXT: v_add3_u32 v7, v7, v29, v28
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v2, v2, v12
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v13
; GFX10-NEXT: v_add3_u32 v7, v7, v24, v15
; GFX10-NEXT: v_add_co_u32 v9, s4, v23, v25
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v22, v22, v26, v27
; GFX10-NEXT: v_add3_u32 v5, v7, v6, v5
; GFX10-NEXT: v_add_co_u32 v6, s4, v9, v21
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v3, v5, v4, v3
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v14
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v5, v22, v10, v7
; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v17
; GFX10-NEXT: v_mov_b32_e32 v3, v18
; GFX10-NEXT: v_add3_u32 v7, v1, v4, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v16
; GFX10-NEXT: v_mov_b32_e32 v4, v19
; GFX10-NEXT: v_mov_b32_e32 v5, v20
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
}