Files
clang-p2996/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
Baptiste Saleil caf1294d95 [AMDGPU] Experiments show that the GCNRegBankReassign pass significantly impacts
the compilation time and there is no case for which we see any improvement in
performance. This patch removes this pass and its associated test cases from
the tree.

Differential Revision: https://reviews.llvm.org/D101313

Change-Id: I0599169a7609c19a887f8d847a71e664030cc141
2021-04-26 17:21:49 -04:00

2945 lines
118 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; i16 multiply in an amdgpu_ps function whose two operands are passed `inreg`
; (s0 and s1 in the expected output) and whose result carries no extension
; attribute. The gfx700 checks expect a bare s_mul_i32; the gfx801, gfx900 and
; gfx1010 checks expect both inputs to be masked with 0xffff before the multiply.
; NOTE(review): the %num/%den names look inherited from a division test; here
; they are simply the two multiplicands.
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GFX7-LABEL: s_mul_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
; i16 multiply with the default calling convention and plain (non-inreg)
; arguments, which appear as v0 and v1 in the expected output. The gfx700 checks
; mask both inputs with 0xffff and use v_mul_u32_u24; the gfx801 and gfx900
; checks use v_mul_lo_u16_e32 directly; the gfx1010 checks use v_mul_lo_u16 and
; additionally expect an s_waitcnt_vscnt at function entry.
define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX7-LABEL: v_mul_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
; Same multiply as the plain `inreg` i16 case, but both arguments and the return
; value are marked `zeroext`. Every target's checks end with an s_and_b32 by
; 0xffff on the product; the gfx801, gfx900 and gfx1010 checks also mask both
; inputs before the s_mul_i32, whereas the gfx700 checks multiply them as-is.
define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
; GFX7-LABEL: s_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16_zeroext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
; i16 multiply with `zeroext` arguments and return value, default calling
; convention (operands in v0/v1 in the expected output). The gfx700 checks
; multiply with v_mul_u32_u24 and then mask the product with 0xffff; the gfx801
; and gfx900 checks expect only v_mul_lo_u16_e32 with no separate extension; the
; gfx1010 checks expect v_mul_lo_u16 followed by a 16-bit v_bfe_u32.
define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX7-LABEL: v_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16_zeroext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
; i16 multiply in an amdgpu_ps function with `inreg signext` arguments and a
; `signext` return value. Every target's checks finish with s_sext_i32_i16 on
; the product. The gfx700 checks multiply the inputs directly, while the gfx801,
; gfx900 and gfx1010 checks first mask both inputs with 0xffff.
define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
; GFX7-LABEL: s_mul_i16_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: s_sext_i32_i16 s0, s0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_sext_i32_i16 s0, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_sext_i32_i16 s0, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16_signext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_sext_i32_i16 s0, s0
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
}
; i16 multiply with `signext` arguments and return value, default calling
; convention (operands in v0/v1 in the expected output). All targets' checks end
; with a 16-bit v_bfe_i32 on the product. The gfx700 checks mask both inputs with
; 0xffff and use v_mul_u32_u24; the other targets use a 16-bit v_mul_lo_u16.
define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX7-LABEL: v_mul_i16_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16_signext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
}
; i32 multiply in an amdgpu_ps function with both operands passed `inreg`.
; All four targets are expected to emit the same single s_mul_i32 on s0 and s1.
define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GCN-LABEL: s_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
%result = mul i32 %num, %den
ret i32 %result
}
; i32 multiply with the default calling convention (operands in v0/v1 in the
; expected output). Every target is expected to emit one v_mul_lo_u32; the
; gfx1010 checks differ only by the extra s_waitcnt_vscnt at function entry.
define i32 @v_mul_i32(i32 %num, i32 %den) {
; GCN-LABEL: v_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i32 %num, %den
ret i32 %result
}
; <2 x i32> multiply in an amdgpu_ps function with both vectors passed `inreg`.
; The checks expect one s_mul_i32 per element: s0*s2 and s1*s3, i.e. the first
; vector occupies s0-s1 and the second s2-s3 in the expected output.
define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
; GCN-LABEL: s_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s2
; GCN-NEXT: s_mul_i32 s1, s1, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: s_mul_i32 s1, s1, s3
; GFX10-NEXT: ; return to shader part epilog
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
; <2 x i32> multiply with the default calling convention. The checks expect one
; v_mul_lo_u32 per element (v0*v2 and v1*v3); the gfx1010 checks differ only by
; the extra s_waitcnt_vscnt at function entry.
define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GCN-LABEL: v_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v2
; GCN-NEXT: v_mul_lo_u32 v1, v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul <2 x i32> %num, %den
ret <2 x i32> %result
}
; i64 multiply in an amdgpu_ps function with both operands passed `inreg`
; (s0-s1 and s2-s3 in the expected output), expanded into 32-bit pieces.
; The gfx700 and gfx801 checks compute the high half of the low*low product
; with the vector instruction v_mul_hi_u32 and bring the final high dword back
; with v_readfirstlane_b32. The gfx900 and gfx1010 checks use s_mul_hi_u32 and
; contain scalar instructions only.
define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX7-LABEL: s_mul_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s4, s0, s2
; GFX7-NEXT: s_mul_i32 s1, s1, s2
; GFX7-NEXT: s_mul_i32 s0, s0, s3
; GFX7-NEXT: s_add_i32 s1, s1, s0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s4, s0, s2
; GFX8-NEXT: s_mul_i32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s3
; GFX8-NEXT: s_add_i32 s1, s1, s0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s3, s0, s3
; GFX9-NEXT: s_mul_i32 s4, s0, s2
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s2
; GFX9-NEXT: s_add_i32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s1, s1, s0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s3, s0, s3
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s3
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: ; return to shader part epilog
%result = mul i64 %num, %den
ret i64 %result
}
; i64 multiply with the default calling convention (operands in v0-v1 and v2-v3
; in the expected output), expanded into three v_mul_lo_u32 and one
; v_mul_hi_u32 on every target. The gfx700 and gfx801 checks combine the three
; high-dword terms with two carry-writing adds and need a final v_mov for the
; low dword; the gfx900 and gfx1010 checks fold the sum into one v_add3_u32.
define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX7-LABEL: v_mul_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v4, v0, v3
; GFX7-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX7-NEXT: v_mul_lo_u32 v3, v0, v2
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v4, v0, v3
; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX8-NEXT: v_mul_lo_u32 v3, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX9-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v1, v1, v3, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2
; GFX10-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v2
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX10-NEXT: v_add3_u32 v1, v1, v3, v4
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i64 %num, %den
ret i64 %result
}
; i96 multiply in an amdgpu_ps function with both operands passed `inreg`
; (s0-s2 and s3-s5 in the expected output). The product is bitcast to
; <3 x i32> before being returned.
; The gfx700 and gfx801 checks mix scalar s_mul_i32 with vector v_mul_hi_u32
; for the high-half products and read the upper two result dwords back with
; v_readfirstlane_b32. The gfx900 and gfx1010 checks contain scalar
; instructions only: s_mul_hi_u32 for the high halves, with each carry-out
; turned into a 0/1 value via s_cselect_b32 after the s_add_u32.
define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX7-LABEL: s_mul_i96:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_mul_i32 s7, s1, s3
; GFX7-NEXT: s_mul_i32 s8, s0, s4
; GFX7-NEXT: s_add_u32 s7, s7, s8
; GFX7-NEXT: v_mov_b32_e32 v3, s4
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s3
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s7, v0
; GFX7-NEXT: s_mul_i32 s7, s1, s4
; GFX7-NEXT: s_mul_i32 s2, s2, s3
; GFX7-NEXT: v_mul_hi_u32 v3, s0, v3
; GFX7-NEXT: s_cselect_b32 s8, 1, 0
; GFX7-NEXT: s_mul_i32 s6, s0, s3
; GFX7-NEXT: s_mul_i32 s5, s0, s5
; GFX7-NEXT: s_add_i32 s0, s2, s7
; GFX7-NEXT: s_add_i32 s0, s0, s5
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; GFX7-NEXT: s_and_b32 s8, s8, 1
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s8, v1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
; GFX7-NEXT: s_mov_b32 s0, s6
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i96:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_mul_i32 s7, s1, s3
; GFX8-NEXT: s_mul_i32 s8, s0, s4
; GFX8-NEXT: s_add_u32 s7, s7, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s7, v0
; GFX8-NEXT: s_mul_i32 s7, s1, s4
; GFX8-NEXT: s_mul_i32 s2, s2, s3
; GFX8-NEXT: v_mul_hi_u32 v3, s0, v3
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
; GFX8-NEXT: s_mul_i32 s6, s0, s3
; GFX8-NEXT: s_mul_i32 s5, s0, s5
; GFX8-NEXT: s_add_i32 s0, s2, s7
; GFX8-NEXT: s_add_i32 s0, s0, s5
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT: s_and_b32 s8, s8, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s8, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: s_mov_b32 s0, s6
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i96:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s7, s1, s3
; GFX9-NEXT: s_mul_i32 s8, s0, s4
; GFX9-NEXT: s_add_u32 s7, s7, s8
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s9, s0, s3
; GFX9-NEXT: s_and_b32 s8, s8, 1
; GFX9-NEXT: s_add_u32 s7, s7, s9
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
; GFX9-NEXT: s_and_b32 s9, s9, 1
; GFX9-NEXT: s_add_i32 s8, s8, s9
; GFX9-NEXT: s_mul_i32 s9, s1, s4
; GFX9-NEXT: s_mul_i32 s2, s2, s3
; GFX9-NEXT: s_mul_i32 s5, s0, s5
; GFX9-NEXT: s_add_i32 s2, s2, s9
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s2, s2, s5
; GFX9-NEXT: s_mul_i32 s6, s0, s3
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s4
; GFX9-NEXT: s_add_i32 s1, s2, s1
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_add_i32 s2, s0, s8
; GFX9-NEXT: s_mov_b32 s0, s6
; GFX9-NEXT: s_mov_b32 s1, s7
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i96:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s6, s1, s3
; GFX10-NEXT: s_mul_i32 s7, s0, s4
; GFX10-NEXT: s_mul_hi_u32 s8, s0, s3
; GFX10-NEXT: s_add_u32 s6, s6, s7
; GFX10-NEXT: s_cselect_b32 s7, 1, 0
; GFX10-NEXT: s_mul_i32 s9, s1, s4
; GFX10-NEXT: s_and_b32 s7, s7, 1
; GFX10-NEXT: s_mul_i32 s2, s2, s3
; GFX10-NEXT: s_add_u32 s6, s6, s8
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_mul_i32 s5, s0, s5
; GFX10-NEXT: s_add_i32 s2, s2, s9
; GFX10-NEXT: s_mul_hi_u32 s1, s1, s3
; GFX10-NEXT: s_add_i32 s2, s2, s5
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s4
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: s_add_i32 s7, s7, s8
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: s_mul_i32 s0, s0, s3
; GFX10-NEXT: s_add_i32 s2, s1, s7
; GFX10-NEXT: s_mov_b32 s1, s6
; GFX10-NEXT: ; return to shader part epilog
%result = mul i96 %num, %den
%cast = bitcast i96 %result to <3 x i32>
ret <3 x i32> %cast
}
; i96 multiply with the default calling convention (operands in v0-v2 and v3-v5
; in the expected output); the i96 product is returned directly.
; Every target's checks build the result from v_mul_lo_u32 / v_mul_hi_u32
; partial products and turn each carry-out into a 0/1 value with
; v_cndmask_b32_e64. The gfx700, gfx801 and gfx900 checks carry through vcc,
; while the gfx1010 checks carry through s4. The gfx900 and gfx1010 checks use
; v_add3_u32 where the older targets use chains of two-operand adds.
define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX7-LABEL: v_mul_i96:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX7-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX7-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX7-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX7-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX7-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX7-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v9
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v0, v8
; GFX7-NEXT: v_mov_b32_e32 v0, v6
; GFX7-NEXT: v_mov_b32_e32 v1, v7
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i96:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX8-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX8-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX8-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX8-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX8-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v8
; GFX8-NEXT: v_mov_b32_e32 v0, v6
; GFX8-NEXT: v_mov_b32_e32 v1, v7
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i96:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v7, v1, v3
; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3
; GFX9-NEXT: v_mul_lo_u32 v10, v1, v4
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX9-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX9-NEXT: v_mul_hi_u32 v1, v1, v3
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v4
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v2, v2, v10
; GFX9-NEXT: v_add_u32_e32 v3, v8, v9
; GFX9-NEXT: v_add3_u32 v1, v2, v5, v1
; GFX9-NEXT: v_add3_u32 v2, v1, v0, v3
; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: v_mov_b32_e32 v1, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v6, v1, v3
; GFX10-NEXT: v_mul_lo_u32 v7, v0, v4
; GFX10-NEXT: v_mul_hi_u32 v8, v0, v3
; GFX10-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX10-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3
; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v7
; GFX10-NEXT: v_mul_hi_u32 v7, v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v9
; GFX10-NEXT: v_add_co_u32 v1, s4, v6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v2, v2, v5, v7
; GFX10-NEXT: v_add_nc_u32_e32 v3, v10, v6
; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
}
; i128 multiply in an amdgpu_ps function with both operands passed `inreg`
; (s0-s3 and s4-s7 in the expected output). The product is bitcast to
; <4 x i32> before being returned.
; The gfx700 and gfx801 checks compute every high-half product with the vector
; instruction v_mul_hi_u32, accumulate the upper three dwords in VGPRs, and
; read them back with v_readfirstlane_b32. The gfx900 and gfx1010 checks
; contain scalar instructions only: s_mul_hi_u32 for the high halves, with each
; carry-out turned into a 0/1 value via s_cselect_b32 after the s_add_u32.
define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
; GFX7-LABEL: s_mul_i128:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s9, s1, s4
; GFX7-NEXT: s_mul_i32 s10, s0, s5
; GFX7-NEXT: s_add_u32 s9, s9, s10
; GFX7-NEXT: s_cselect_b32 s10, 1, 0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s9, v0
; GFX7-NEXT: s_and_b32 s10, s10, 1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s10, v1
; GFX7-NEXT: s_mul_i32 s9, s2, s4
; GFX7-NEXT: s_mul_i32 s10, s1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_add_u32 s9, s9, s10
; GFX7-NEXT: s_cselect_b32 s10, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX7-NEXT: s_mul_i32 s11, s0, s6
; GFX7-NEXT: s_and_b32 s10, s10, 1
; GFX7-NEXT: s_add_u32 s9, s9, s11
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: s_cselect_b32 s11, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s9, v2
; GFX7-NEXT: s_and_b32 s11, s11, 1
; GFX7-NEXT: s_add_i32 s10, s10, s11
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s10, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: s_mul_i32 s5, s2, s5
; GFX7-NEXT: s_mul_i32 s3, s3, s4
; GFX7-NEXT: v_mul_hi_u32 v4, v4, s4
; GFX7-NEXT: s_mul_i32 s8, s0, s4
; GFX7-NEXT: s_mul_i32 s9, s1, s6
; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX7-NEXT: s_mul_i32 s7, s0, s7
; GFX7-NEXT: v_mul_hi_u32 v5, s0, v5
; GFX7-NEXT: s_add_i32 s0, s3, s5
; GFX7-NEXT: s_add_i32 s0, s0, s9
; GFX7-NEXT: s_add_i32 s0, s0, s7
; GFX7-NEXT: v_add_i32_e32 v4, vcc, s0, v4
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
; GFX7-NEXT: v_readfirstlane_b32 s3, v2
; GFX7-NEXT: s_mov_b32 s0, s8
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s9, s1, s4
; GFX8-NEXT: s_mul_i32 s10, s0, s5
; GFX8-NEXT: s_add_u32 s9, s9, s10
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s9, v0
; GFX8-NEXT: s_and_b32 s10, s10, 1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s10, v1
; GFX8-NEXT: s_mul_i32 s9, s2, s4
; GFX8-NEXT: s_mul_i32 s10, s1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_add_u32 s9, s9, s10
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4
; GFX8-NEXT: s_mul_i32 s11, s0, s6
; GFX8-NEXT: s_and_b32 s10, s10, 1
; GFX8-NEXT: s_add_u32 s9, s9, s11
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_cselect_b32 s11, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s9, v2
; GFX8-NEXT: s_and_b32 s11, s11, 1
; GFX8-NEXT: s_add_i32 s10, s10, s11
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s10, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_mul_i32 s5, s2, s5
; GFX8-NEXT: s_mul_i32 s3, s3, s4
; GFX8-NEXT: v_mul_hi_u32 v4, v4, s4
; GFX8-NEXT: s_mul_i32 s8, s0, s4
; GFX8-NEXT: s_mul_i32 s9, s1, s6
; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX8-NEXT: s_mul_i32 s7, s0, s7
; GFX8-NEXT: v_mul_hi_u32 v5, s0, v5
; GFX8-NEXT: s_add_i32 s0, s3, s5
; GFX8-NEXT: s_add_i32 s0, s0, s9
; GFX8-NEXT: s_add_i32 s0, s0, s7
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_readfirstlane_b32 s3, v2
; GFX8-NEXT: s_mov_b32 s0, s8
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s9, s1, s4
; GFX9-NEXT: s_mul_i32 s10, s0, s5
; GFX9-NEXT: s_add_u32 s9, s9, s10
; GFX9-NEXT: s_cselect_b32 s10, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s11, s0, s4
; GFX9-NEXT: s_and_b32 s10, s10, 1
; GFX9-NEXT: s_add_u32 s9, s9, s11
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: s_add_i32 s10, s10, s11
; GFX9-NEXT: s_mul_i32 s11, s2, s4
; GFX9-NEXT: s_mul_i32 s12, s1, s5
; GFX9-NEXT: s_add_u32 s11, s11, s12
; GFX9-NEXT: s_cselect_b32 s12, 1, 0
; GFX9-NEXT: s_mul_i32 s13, s0, s6
; GFX9-NEXT: s_and_b32 s12, s12, 1
; GFX9-NEXT: s_add_u32 s11, s11, s13
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s11, s11, s14
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_mul_hi_u32 s15, s0, s5
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s11, s11, s15
; GFX9-NEXT: s_cselect_b32 s13, 1, 0
; GFX9-NEXT: s_and_b32 s13, s13, 1
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
; GFX9-NEXT: s_and_b32 s11, s11, 1
; GFX9-NEXT: s_add_i32 s12, s12, s11
; GFX9-NEXT: s_mul_i32 s11, s2, s5
; GFX9-NEXT: s_mul_i32 s3, s3, s4
; GFX9-NEXT: s_mul_i32 s13, s1, s6
; GFX9-NEXT: s_add_i32 s3, s3, s11
; GFX9-NEXT: s_mul_i32 s7, s0, s7
; GFX9-NEXT: s_add_i32 s3, s3, s13
; GFX9-NEXT: s_mul_hi_u32 s2, s2, s4
; GFX9-NEXT: s_add_i32 s3, s3, s7
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s5
; GFX9-NEXT: s_add_i32 s2, s3, s2
; GFX9-NEXT: s_mul_i32 s8, s0, s4
; GFX9-NEXT: s_add_i32 s1, s2, s1
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s6
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_add_i32 s3, s0, s12
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_mov_b32 s2, s10
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s8, s1, s4
; GFX10-NEXT: s_mul_i32 s9, s0, s5
; GFX10-NEXT: s_mul_hi_u32 s10, s0, s4
; GFX10-NEXT: s_add_u32 s8, s8, s9
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: s_mul_i32 s11, s1, s5
; GFX10-NEXT: s_and_b32 s9, s9, 1
; GFX10-NEXT: s_add_u32 s8, s8, s10
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_mul_i32 s12, s0, s6
; GFX10-NEXT: s_and_b32 s10, s10, 1
; GFX10-NEXT: s_mul_hi_u32 s13, s1, s4
; GFX10-NEXT: s_add_i32 s9, s9, s10
; GFX10-NEXT: s_mul_i32 s10, s2, s4
; GFX10-NEXT: s_mul_i32 s3, s3, s4
; GFX10-NEXT: s_add_u32 s10, s10, s11
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
; GFX10-NEXT: s_mul_i32 s7, s0, s7
; GFX10-NEXT: s_and_b32 s11, s11, 1
; GFX10-NEXT: s_add_u32 s10, s10, s12
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_and_b32 s12, s12, 1
; GFX10-NEXT: s_add_i32 s11, s11, s12
; GFX10-NEXT: s_add_u32 s10, s10, s13
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s13, s0, s5
; GFX10-NEXT: s_and_b32 s12, s12, 1
; GFX10-NEXT: s_add_i32 s11, s11, s12
; GFX10-NEXT: s_add_u32 s10, s10, s13
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_mul_i32 s13, s1, s6
; GFX10-NEXT: s_and_b32 s12, s12, 1
; GFX10-NEXT: s_mul_hi_u32 s1, s1, s5
; GFX10-NEXT: s_add_i32 s11, s11, s12
; GFX10-NEXT: s_mul_i32 s12, s2, s5
; GFX10-NEXT: s_add_u32 s9, s10, s9
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_add_i32 s3, s3, s12
; GFX10-NEXT: s_mul_hi_u32 s2, s2, s4
; GFX10-NEXT: s_add_i32 s3, s3, s13
; GFX10-NEXT: s_and_b32 s10, s10, 1
; GFX10-NEXT: s_add_i32 s3, s3, s7
; GFX10-NEXT: s_add_i32 s11, s11, s10
; GFX10-NEXT: s_add_i32 s2, s3, s2
; GFX10-NEXT: s_mul_hi_u32 s3, s0, s6
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: s_mul_i32 s0, s0, s4
; GFX10-NEXT: s_add_i32 s1, s1, s3
; GFX10-NEXT: s_mov_b32 s2, s9
; GFX10-NEXT: s_add_i32 s3, s1, s11
; GFX10-NEXT: s_mov_b32 s1, s8
; GFX10-NEXT: ; return to shader part epilog
%result = mul i128 %num, %den
%cast = bitcast i128 %result to <4 x i32>
ret <4 x i32> %cast
}
define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-LABEL: v_mul_i128:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX7-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX7-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX7-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX7-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX7-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX7-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX7-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX7-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v14
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v11, v15
; GFX7-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; GFX7-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX7-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX7-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX7-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v12
; GFX7-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v13
; GFX7-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v0, v11
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
; GFX7-NEXT: v_mov_b32_e32 v2, v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX8-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX8-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX8-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX8-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v11
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX8-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX8-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX8-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v12
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v13
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v14
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v15
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v12, v11
; GFX8-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX8-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX8-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX8-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v12
; GFX8-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v13
; GFX8-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v11
; GFX8-NEXT: v_mov_b32_e32 v0, v8
; GFX8-NEXT: v_mov_b32_e32 v1, v9
; GFX8-NEXT: v_mov_b32_e32 v2, v10
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5
; GFX9-NEXT: v_mul_hi_u32 v11, v0, v4
; GFX9-NEXT: v_mul_lo_u32 v12, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v10
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v11
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v10, v10, v11
; GFX9-NEXT: v_mul_lo_u32 v11, v2, v4
; GFX9-NEXT: v_mul_hi_u32 v14, v1, v4
; GFX9-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v12
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v14
; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v15
; GFX9-NEXT: v_add3_u32 v12, v12, v13, v14
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v11, v12, v13, v11
; GFX9-NEXT: v_mul_lo_u32 v12, v2, v5
; GFX9-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX9-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX9-NEXT: v_mul_hi_u32 v2, v2, v4
; GFX9-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v8, v0, v4
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v6
; GFX9-NEXT: v_add_u32_e32 v3, v3, v12
; GFX9-NEXT: v_add3_u32 v3, v3, v13, v7
; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX9-NEXT: v_add3_u32 v3, v1, v0, v11
; GFX9-NEXT: v_mov_b32_e32 v0, v8
; GFX9-NEXT: v_mov_b32_e32 v1, v9
; GFX9-NEXT: v_mov_b32_e32 v2, v10
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v8, v2, v4
; GFX10-NEXT: v_mul_lo_u32 v9, v1, v5
; GFX10-NEXT: v_mul_lo_u32 v10, v1, v4
; GFX10-NEXT: v_mul_lo_u32 v11, v0, v5
; GFX10-NEXT: v_mul_hi_u32 v12, v0, v4
; GFX10-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX10-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX10-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX10-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX10-NEXT: v_add_co_u32 v8, s4, v8, v9
; GFX10-NEXT: v_add_co_u32 v9, s5, v10, v11
; GFX10-NEXT: v_mul_hi_u32 v11, v1, v4
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v13, s4, v8, v13
; GFX10-NEXT: v_add_co_u32 v8, s5, v9, v12
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v11, s4, v13, v11
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9
; GFX10-NEXT: v_mul_lo_u32 v10, v2, v5
; GFX10-NEXT: v_add_co_u32 v11, s4, v11, v15
; GFX10-NEXT: v_mul_hi_u32 v15, v2, v4
; GFX10-NEXT: v_add3_u32 v12, v14, v12, v13
; GFX10-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v2, s4, v11, v9
; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v6, v0, v6
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
; GFX10-NEXT: v_add3_u32 v3, v3, v13, v7
; GFX10-NEXT: v_add3_u32 v4, v12, v14, v5
; GFX10-NEXT: v_add3_u32 v1, v3, v15, v1
; GFX10-NEXT: v_add3_u32 v3, v1, v6, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i128 %num, %den
ret i128 %result
}
define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
; GFX7-LABEL: s_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s17, s1, s8
; GFX7-NEXT: s_mul_i32 s18, s0, s9
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s17, v0
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s18, v1
; GFX7-NEXT: s_mul_i32 s17, s2, s8
; GFX7-NEXT: s_mul_i32 s18, s1, s9
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s8
; GFX7-NEXT: s_mul_i32 s19, s0, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_mov_b32_e32 v3, s9
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX7-NEXT: v_add_i32_e32 v2, vcc, s17, v2
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s18, v5
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX7-NEXT: s_mul_i32 s17, s3, s8
; GFX7-NEXT: s_mul_i32 s18, s2, s9
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: s_mul_i32 s19, s1, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v4, s2
; GFX7-NEXT: v_mul_hi_u32 v5, v4, s8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_mul_i32 s20, s0, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX7-NEXT: v_add_i32_e32 v5, vcc, s17, v5
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mov_b32_e32 v6, s10
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v8, vcc, s18, v8
; GFX7-NEXT: v_mul_hi_u32 v7, s0, v6
; GFX7-NEXT: s_mul_i32 s17, s4, s8
; GFX7-NEXT: s_mul_i32 s18, s3, s9
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GFX7-NEXT: s_mul_i32 s19, s2, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: s_mul_i32 s20, s1, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GFX7-NEXT: v_mov_b32_e32 v5, s3
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mul_hi_u32 v7, v5, s8
; GFX7-NEXT: s_mul_i32 s21, s0, s12
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s21
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v7, vcc, s17, v7
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mul_hi_u32 v4, v4, s9
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v11, vcc, s18, v11
; GFX7-NEXT: s_mul_i32 s17, s5, s8
; GFX7-NEXT: s_mul_i32 s18, s4, s9
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: v_mul_hi_u32 v8, s1, v6
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4
; GFX7-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX7-NEXT: v_mov_b32_e32 v9, s11
; GFX7-NEXT: s_mul_i32 s19, s3, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: v_mul_hi_u32 v10, s0, v9
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT: s_mul_i32 s20, s2, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT: s_mul_i32 s21, s1, s12
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s21
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v7, v4
; GFX7-NEXT: v_mov_b32_e32 v7, s4
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mul_hi_u32 v8, v7, s8
; GFX7-NEXT: s_mul_i32 s22, s0, s13
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s22
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v8, vcc, s17, v8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_mul_hi_u32 v10, v5, s9
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v14, vcc, s18, v14
; GFX7-NEXT: s_mul_i32 s17, s6, s8
; GFX7-NEXT: s_mul_i32 s18, s5, s9
; GFX7-NEXT: s_add_u32 s17, s17, s18
; GFX7-NEXT: s_cselect_b32 s18, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v6, s2, v6
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT: s_mul_i32 s19, s4, s10
; GFX7-NEXT: s_and_b32 s18, s18, 1
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s19
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v14, v10
; GFX7-NEXT: v_mul_hi_u32 v11, s1, v9
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX7-NEXT: v_mov_b32_e32 v12, s12
; GFX7-NEXT: s_mul_i32 s20, s3, s11
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GFX7-NEXT: s_add_u32 s17, s17, s20
; GFX7-NEXT: v_mul_hi_u32 v13, s0, v12
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v11
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT: s_mul_i32 s21, s2, s12
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v6, v13
; GFX7-NEXT: s_add_u32 s17, s17, s21
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v6, v4
; GFX7-NEXT: s_mul_i32 s22, s1, s13
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX7-NEXT: s_add_u32 s17, s17, s22
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; GFX7-NEXT: v_mov_b32_e32 v8, s5
; GFX7-NEXT: v_mul_hi_u32 v10, v8, s8
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_mul_i32 s23, s0, s14
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: s_add_u32 s17, s17, s23
; GFX7-NEXT: s_cselect_b32 s19, 1, 0
; GFX7-NEXT: v_mul_hi_u32 v11, v7, s9
; GFX7-NEXT: v_add_i32_e32 v10, vcc, s17, v10
; GFX7-NEXT: s_and_b32 s19, s19, 1
; GFX7-NEXT: s_add_i32 s18, s18, s19
; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v17, vcc, s18, v17
; GFX7-NEXT: v_mul_hi_u32 v5, v5, s10
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_mul_hi_u32 v13, s2, v9
; GFX7-NEXT: v_add_i32_e32 v11, vcc, v17, v11
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GFX7-NEXT: v_mul_hi_u32 v14, s1, v12
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v13
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_mov_b32_e32 v15, s13
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_mul_hi_u32 v16, s0, v15
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v14
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v16
; GFX7-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GFX7-NEXT: v_mov_b32_e32 v13, s14
; GFX7-NEXT: s_mul_i32 s7, s7, s8
; GFX7-NEXT: s_mul_i32 s17, s6, s9
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GFX7-NEXT: s_mul_i32 s16, s0, s8
; GFX7-NEXT: s_mul_i32 s5, s5, s10
; GFX7-NEXT: s_mul_i32 s15, s0, s15
; GFX7-NEXT: v_mul_hi_u32 v13, s0, v13
; GFX7-NEXT: s_add_i32 s0, s7, s17
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX7-NEXT: s_mul_i32 s4, s4, s11
; GFX7-NEXT: s_add_i32 s0, s0, s5
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v10, v6
; GFX7-NEXT: v_mov_b32_e32 v10, s6
; GFX7-NEXT: s_mul_i32 s11, s3, s12
; GFX7-NEXT: s_add_i32 s0, s0, s4
; GFX7-NEXT: s_mul_i32 s12, s2, s13
; GFX7-NEXT: s_add_i32 s0, s0, s11
; GFX7-NEXT: v_mul_hi_u32 v10, v10, s8
; GFX7-NEXT: s_mul_i32 s13, s1, s14
; GFX7-NEXT: s_add_i32 s0, s0, s12
; GFX7-NEXT: v_mul_hi_u32 v8, v8, s9
; GFX7-NEXT: s_add_i32 s0, s0, s13
; GFX7-NEXT: v_mul_hi_u32 v7, v7, s10
; GFX7-NEXT: v_mul_hi_u32 v9, s3, v9
; GFX7-NEXT: s_add_i32 s0, s0, s15
; GFX7-NEXT: v_mul_hi_u32 v11, s2, v12
; GFX7-NEXT: v_add_i32_e32 v10, vcc, s0, v10
; GFX7-NEXT: v_mul_hi_u32 v12, s1, v15
; GFX7-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v8, v7
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v11
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v12
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v7, v13
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s2, v1
; GFX7-NEXT: v_readfirstlane_b32 s3, v2
; GFX7-NEXT: v_readfirstlane_b32 s4, v3
; GFX7-NEXT: v_readfirstlane_b32 s5, v4
; GFX7-NEXT: v_readfirstlane_b32 s6, v5
; GFX7-NEXT: v_readfirstlane_b32 s7, v6
; GFX7-NEXT: s_mov_b32 s0, s16
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s17, s1, s8
; GFX8-NEXT: s_mul_i32 s18, s0, s9
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s17, v0
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s18, v1
; GFX8-NEXT: s_mul_i32 s17, s2, s8
; GFX8-NEXT: s_mul_i32 s18, s1, s9
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s8
; GFX8-NEXT: s_mul_i32 s19, s0, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_mov_b32_e32 v3, s9
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v4, s0, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s17, v2
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s18, v5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: s_mul_i32 s17, s3, s8
; GFX8-NEXT: s_mul_i32 s18, s2, s9
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: s_mul_i32 s19, s1, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_mul_hi_u32 v5, v4, s8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_mul_i32 s20, s0, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s17, v5
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mov_b32_e32 v6, s10
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s18, v8
; GFX8-NEXT: v_mul_hi_u32 v7, s0, v6
; GFX8-NEXT: s_mul_i32 s17, s4, s8
; GFX8-NEXT: s_mul_i32 s18, s3, s9
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v5
; GFX8-NEXT: s_mul_i32 s19, s2, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: s_mul_i32 s20, s1, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mul_hi_u32 v7, v5, s8
; GFX8-NEXT: s_mul_i32 s21, s0, s12
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s21
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s17, v7
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mul_hi_u32 v4, v4, s9
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s18, v11
; GFX8-NEXT: s_mul_i32 s17, s5, s8
; GFX8-NEXT: s_mul_i32 s18, s4, s9
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: v_mul_hi_u32 v8, s1, v6
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NEXT: s_mul_i32 s19, s3, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v11, v7
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: v_mul_hi_u32 v10, s0, v9
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: s_mul_i32 s20, s2, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: s_mul_i32 s21, s1, s12
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s21
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
; GFX8-NEXT: v_mov_b32_e32 v7, s4
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mul_hi_u32 v8, v7, s8
; GFX8-NEXT: s_mul_i32 s22, s0, s13
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s22
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s17, v8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_mul_hi_u32 v10, v5, s9
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s18, v14
; GFX8-NEXT: s_mul_i32 s17, s6, s8
; GFX8-NEXT: s_mul_i32 s18, s5, s9
; GFX8-NEXT: s_add_u32 s17, s17, s18
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v6, s2, v6
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT: s_mul_i32 s19, s4, s10
; GFX8-NEXT: s_and_b32 s18, s18, 1
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s19
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v14, v10
; GFX8-NEXT: v_mul_hi_u32 v11, s1, v9
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_mov_b32_e32 v12, s12
; GFX8-NEXT: s_mul_i32 s20, s3, s11
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8
; GFX8-NEXT: s_add_u32 s17, s17, s20
; GFX8-NEXT: v_mul_hi_u32 v13, s0, v12
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT: s_mul_i32 s21, s2, s12
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13
; GFX8-NEXT: s_add_u32 s17, s17, s21
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
; GFX8-NEXT: s_mul_i32 s22, s1, s13
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT: s_add_u32 s17, s17, s22
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
; GFX8-NEXT: v_mov_b32_e32 v8, s5
; GFX8-NEXT: v_mul_hi_u32 v10, v8, s8
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_mul_i32 s23, s0, s14
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: s_add_u32 s17, s17, s23
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
; GFX8-NEXT: v_mul_hi_u32 v11, v7, s9
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s17, v10
; GFX8-NEXT: s_and_b32 s19, s19, 1
; GFX8-NEXT: s_add_i32 s18, s18, s19
; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s18, v17
; GFX8-NEXT: v_mul_hi_u32 v5, v5, s10
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_mul_hi_u32 v13, s2, v9
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v17, v11
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5
; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v11, v10
; GFX8-NEXT: v_mul_hi_u32 v14, s1, v12
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v13
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_mov_b32_e32 v15, s13
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_mul_hi_u32 v16, s0, v15
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v14
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v16
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; GFX8-NEXT: v_mov_b32_e32 v13, s14
; GFX8-NEXT: s_mul_i32 s7, s7, s8
; GFX8-NEXT: s_mul_i32 s17, s6, s9
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6
; GFX8-NEXT: s_mul_i32 s16, s0, s8
; GFX8-NEXT: s_mul_i32 s5, s5, s10
; GFX8-NEXT: s_mul_i32 s15, s0, s15
; GFX8-NEXT: v_mul_hi_u32 v13, s0, v13
; GFX8-NEXT: s_add_i32 s0, s7, s17
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT: s_mul_i32 s4, s4, s11
; GFX8-NEXT: s_add_i32 s0, s0, s5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v10, v6
; GFX8-NEXT: v_mov_b32_e32 v10, s6
; GFX8-NEXT: s_mul_i32 s11, s3, s12
; GFX8-NEXT: s_add_i32 s0, s0, s4
; GFX8-NEXT: s_mul_i32 s12, s2, s13
; GFX8-NEXT: s_add_i32 s0, s0, s11
; GFX8-NEXT: v_mul_hi_u32 v10, v10, s8
; GFX8-NEXT: s_mul_i32 s13, s1, s14
; GFX8-NEXT: s_add_i32 s0, s0, s12
; GFX8-NEXT: v_mul_hi_u32 v8, v8, s9
; GFX8-NEXT: s_add_i32 s0, s0, s13
; GFX8-NEXT: v_mul_hi_u32 v7, v7, s10
; GFX8-NEXT: v_mul_hi_u32 v9, s3, v9
; GFX8-NEXT: s_add_i32 s0, s0, s15
; GFX8-NEXT: v_mul_hi_u32 v11, s2, v12
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s0, v10
; GFX8-NEXT: v_mul_hi_u32 v12, s1, v15
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v10, v8
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v12
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v13
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
; GFX8-NEXT: v_readfirstlane_b32 s3, v2
; GFX8-NEXT: v_readfirstlane_b32 s4, v3
; GFX8-NEXT: v_readfirstlane_b32 s5, v4
; GFX8-NEXT: v_readfirstlane_b32 s6, v5
; GFX8-NEXT: v_readfirstlane_b32 s7, v6
; GFX8-NEXT: s_mov_b32 s0, s16
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s17, s1, s8
; GFX9-NEXT: s_mul_i32 s18, s0, s9
; GFX9-NEXT: s_add_u32 s17, s17, s18
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
; GFX9-NEXT: s_mul_hi_u32 s19, s0, s8
; GFX9-NEXT: s_and_b32 s18, s18, 1
; GFX9-NEXT: s_add_u32 s17, s17, s19
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
; GFX9-NEXT: s_and_b32 s19, s19, 1
; GFX9-NEXT: s_add_i32 s18, s18, s19
; GFX9-NEXT: s_mul_i32 s19, s2, s8
; GFX9-NEXT: s_mul_i32 s20, s1, s9
; GFX9-NEXT: s_add_u32 s19, s19, s20
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
; GFX9-NEXT: s_mul_i32 s21, s0, s10
; GFX9-NEXT: s_and_b32 s20, s20, 1
; GFX9-NEXT: s_add_u32 s19, s19, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_mul_hi_u32 s22, s1, s8
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s19, s19, s22
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_mul_hi_u32 s23, s0, s9
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s19, s19, s23
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_i32 s20, s20, s21
; GFX9-NEXT: s_add_u32 s18, s19, s18
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
; GFX9-NEXT: s_and_b32 s19, s19, 1
; GFX9-NEXT: s_add_i32 s20, s20, s19
; GFX9-NEXT: s_mul_i32 s19, s3, s8
; GFX9-NEXT: s_mul_i32 s21, s2, s9
; GFX9-NEXT: s_add_u32 s19, s19, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_mul_i32 s22, s1, s10
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_u32 s19, s19, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_i32 s23, s0, s11
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s23
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s24, s2, s8
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s24
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s25, s1, s9
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s25
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_mul_hi_u32 s26, s0, s10
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s26
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_i32 s21, s21, s22
; GFX9-NEXT: s_add_u32 s19, s19, s20
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
; GFX9-NEXT: s_and_b32 s20, s20, 1
; GFX9-NEXT: s_add_i32 s21, s21, s20
; GFX9-NEXT: s_mul_i32 s20, s4, s8
; GFX9-NEXT: s_mul_i32 s22, s3, s9
; GFX9-NEXT: s_add_u32 s20, s20, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_mul_i32 s23, s2, s10
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_u32 s20, s20, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_i32 s24, s1, s11
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s24
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_i32 s25, s0, s12
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s25
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s26, s3, s8
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s26
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s27, s2, s9
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s27
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s28, s1, s10
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s28
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_mul_hi_u32 s29, s0, s11
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s29
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_i32 s22, s22, s23
; GFX9-NEXT: s_add_u32 s20, s20, s21
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
; GFX9-NEXT: s_and_b32 s21, s21, 1
; GFX9-NEXT: s_add_i32 s22, s22, s21
; GFX9-NEXT: s_mul_i32 s21, s5, s8
; GFX9-NEXT: s_mul_i32 s23, s4, s9
; GFX9-NEXT: s_add_u32 s21, s21, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_mul_i32 s24, s3, s10
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_u32 s21, s21, s24
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s25, s2, s11
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s25
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s26, s1, s12
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s26
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_i32 s27, s0, s13
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s27
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s28, s4, s8
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s28
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s29
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s30, s2, s10
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s30
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s31, s1, s11
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s31
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_mul_hi_u32 s33, s0, s12
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s33
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_add_i32 s23, s23, s24
; GFX9-NEXT: s_add_u32 s21, s21, s22
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
; GFX9-NEXT: s_and_b32 s22, s22, 1
; GFX9-NEXT: s_add_i32 s23, s23, s22
; GFX9-NEXT: s_mul_i32 s22, s6, s8
; GFX9-NEXT: s_mul_i32 s24, s5, s9
; GFX9-NEXT: s_add_u32 s22, s22, s24
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
; GFX9-NEXT: s_mul_i32 s25, s4, s10
; GFX9-NEXT: s_and_b32 s24, s24, 1
; GFX9-NEXT: s_add_u32 s22, s22, s25
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s26, s3, s11
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s26
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s27, s2, s12
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s27
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s28, s1, s13
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s28
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_i32 s29, s0, s14
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s29
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s30, s5, s8
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s30
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s31, s4, s9
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s31
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s33, s3, s10
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s33
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s34, s2, s11
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s34
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s35, s1, s12
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s35
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_mul_hi_u32 s36, s0, s13
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s36
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
; GFX9-NEXT: s_and_b32 s25, s25, 1
; GFX9-NEXT: s_add_i32 s24, s24, s25
; GFX9-NEXT: s_add_u32 s22, s22, s23
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_add_i32 s24, s24, s23
; GFX9-NEXT: s_mul_i32 s23, s6, s9
; GFX9-NEXT: s_mul_i32 s7, s7, s8
; GFX9-NEXT: s_mul_i32 s25, s5, s10
; GFX9-NEXT: s_add_i32 s7, s7, s23
; GFX9-NEXT: s_mul_i32 s26, s4, s11
; GFX9-NEXT: s_add_i32 s7, s7, s25
; GFX9-NEXT: s_mul_i32 s27, s3, s12
; GFX9-NEXT: s_add_i32 s7, s7, s26
; GFX9-NEXT: s_mul_i32 s28, s2, s13
; GFX9-NEXT: s_add_i32 s7, s7, s27
; GFX9-NEXT: s_mul_i32 s29, s1, s14
; GFX9-NEXT: s_add_i32 s7, s7, s28
; GFX9-NEXT: s_mul_i32 s15, s0, s15
; GFX9-NEXT: s_add_i32 s7, s7, s29
; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8
; GFX9-NEXT: s_add_i32 s7, s7, s15
; GFX9-NEXT: s_mul_hi_u32 s5, s5, s9
; GFX9-NEXT: s_add_i32 s6, s7, s6
; GFX9-NEXT: s_add_i32 s5, s6, s5
; GFX9-NEXT: s_mul_hi_u32 s4, s4, s10
; GFX9-NEXT: s_add_i32 s4, s5, s4
; GFX9-NEXT: s_mul_hi_u32 s3, s3, s11
; GFX9-NEXT: s_add_i32 s3, s4, s3
; GFX9-NEXT: s_mul_hi_u32 s2, s2, s12
; GFX9-NEXT: s_add_i32 s2, s3, s2
; GFX9-NEXT: s_mul_hi_u32 s1, s1, s13
; GFX9-NEXT: s_mul_i32 s16, s0, s8
; GFX9-NEXT: s_add_i32 s1, s2, s1
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s14
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_add_i32 s7, s0, s24
; GFX9-NEXT: s_mov_b32 s0, s16
; GFX9-NEXT: s_mov_b32 s1, s17
; GFX9-NEXT: s_mov_b32 s2, s18
; GFX9-NEXT: s_mov_b32 s3, s19
; GFX9-NEXT: s_mov_b32 s4, s20
; GFX9-NEXT: s_mov_b32 s5, s21
; GFX9-NEXT: s_mov_b32 s6, s22
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i256:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s16, s1, s8
; GFX10-NEXT: s_mul_i32 s17, s0, s9
; GFX10-NEXT: s_mul_hi_u32 s18, s0, s8
; GFX10-NEXT: s_add_u32 s16, s16, s17
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_mul_i32 s19, s1, s9
; GFX10-NEXT: s_and_b32 s17, s17, 1
; GFX10-NEXT: s_add_u32 s16, s16, s18
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
; GFX10-NEXT: s_mul_i32 s20, s0, s10
; GFX10-NEXT: s_and_b32 s18, s18, 1
; GFX10-NEXT: s_mul_hi_u32 s21, s1, s8
; GFX10-NEXT: s_add_i32 s17, s17, s18
; GFX10-NEXT: s_mul_i32 s18, s2, s8
; GFX10-NEXT: s_mul_i32 s22, s0, s11
; GFX10-NEXT: s_add_u32 s18, s18, s19
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
; GFX10-NEXT: s_mul_i32 s23, s1, s11
; GFX10-NEXT: s_and_b32 s19, s19, 1
; GFX10-NEXT: s_add_u32 s18, s18, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s24, s0, s12
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_mul_i32 s25, s4, s9
; GFX10-NEXT: s_add_i32 s19, s19, s20
; GFX10-NEXT: s_add_u32 s18, s18, s21
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s21, s0, s9
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_mul_i32 s26, s2, s11
; GFX10-NEXT: s_add_i32 s19, s19, s20
; GFX10-NEXT: s_add_u32 s18, s18, s21
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s21, s1, s10
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_mul_i32 s27, s0, s13
; GFX10-NEXT: s_add_i32 s19, s19, s20
; GFX10-NEXT: s_add_u32 s17, s18, s17
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
; GFX10-NEXT: s_mul_i32 s20, s2, s9
; GFX10-NEXT: s_and_b32 s18, s18, 1
; GFX10-NEXT: s_mul_hi_u32 s28, s3, s9
; GFX10-NEXT: s_add_i32 s19, s19, s18
; GFX10-NEXT: s_mul_i32 s18, s3, s8
; GFX10-NEXT: s_mul_i32 s7, s7, s8
; GFX10-NEXT: s_add_u32 s18, s18, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s15, s0, s15
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_add_u32 s18, s18, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s22, s2, s8
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s22, s1, s9
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s22, s0, s10
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s22
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_i32 s22, s2, s10
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s20, s20, s21
; GFX10-NEXT: s_add_u32 s18, s18, s19
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
; GFX10-NEXT: s_mul_i32 s21, s3, s9
; GFX10-NEXT: s_and_b32 s19, s19, 1
; GFX10-NEXT: s_add_i32 s20, s20, s19
; GFX10-NEXT: s_mul_i32 s19, s4, s8
; GFX10-NEXT: s_add_u32 s19, s19, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_u32 s19, s19, s22
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s23
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s23, s3, s8
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s24
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s24, s2, s9
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s23
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s23, s1, s10
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s24
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s24, s0, s11
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s23
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_i32 s23, s5, s8
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s24
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_i32 s24, s3, s10
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s21, s21, s22
; GFX10-NEXT: s_add_u32 s19, s19, s20
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
; GFX10-NEXT: s_mul_i32 s22, s1, s12
; GFX10-NEXT: s_and_b32 s20, s20, 1
; GFX10-NEXT: s_add_i32 s21, s21, s20
; GFX10-NEXT: s_add_u32 s23, s23, s25
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_u32 s23, s23, s24
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s24, s25, s24
; GFX10-NEXT: s_add_u32 s23, s23, s26
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s26, s2, s10
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_i32 s24, s24, s25
; GFX10-NEXT: s_add_u32 s22, s23, s22
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s25, s1, s11
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s23, s24, s23
; GFX10-NEXT: s_add_u32 s22, s22, s27
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s27, s0, s12
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s20, s22, s20
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_mul_i32 s24, s6, s8
; GFX10-NEXT: s_and_b32 s22, s22, 1
; GFX10-NEXT: s_add_i32 s22, s23, s22
; GFX10-NEXT: s_add_u32 s20, s20, s28
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s28, s5, s9
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s26
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s26, s4, s10
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s25
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s25, s3, s11
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s27
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_i32 s27, s2, s12
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s22, s22, s23
; GFX10-NEXT: s_add_u32 s20, s20, s21
; GFX10-NEXT: s_cselect_b32 s21, 1, 0
; GFX10-NEXT: s_mul_i32 s23, s1, s13
; GFX10-NEXT: s_and_b32 s21, s21, 1
; GFX10-NEXT: s_add_i32 s22, s22, s21
; GFX10-NEXT: s_add_u32 s21, s24, s28
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s28, s0, s14
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_u32 s21, s21, s26
; GFX10-NEXT: s_cselect_b32 s26, 1, 0
; GFX10-NEXT: s_and_b32 s26, s26, 1
; GFX10-NEXT: s_add_i32 s24, s24, s26
; GFX10-NEXT: s_add_u32 s21, s21, s25
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s26, s5, s8
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_i32 s24, s24, s25
; GFX10-NEXT: s_add_u32 s21, s21, s27
; GFX10-NEXT: s_cselect_b32 s25, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s27, s4, s9
; GFX10-NEXT: s_and_b32 s25, s25, 1
; GFX10-NEXT: s_add_i32 s24, s24, s25
; GFX10-NEXT: s_add_u32 s21, s21, s23
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s25, s3, s10
; GFX10-NEXT: s_and_b32 s23, s23, 1
; GFX10-NEXT: s_add_i32 s23, s24, s23
; GFX10-NEXT: s_add_u32 s21, s21, s28
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s28, s2, s11
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s26
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s26, s1, s12
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s27
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_hi_u32 s27, s0, s13
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s25
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s25, s6, s9
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_mul_hi_u32 s6, s6, s8
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s28
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s26
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s26, s5, s10
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_mul_hi_u32 s5, s5, s9
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s27
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_mul_i32 s27, s4, s11
; GFX10-NEXT: s_and_b32 s24, s24, 1
; GFX10-NEXT: s_mul_hi_u32 s4, s4, s10
; GFX10-NEXT: s_add_i32 s23, s23, s24
; GFX10-NEXT: s_add_u32 s21, s21, s22
; GFX10-NEXT: s_cselect_b32 s22, 1, 0
; GFX10-NEXT: s_add_i32 s7, s7, s25
; GFX10-NEXT: s_mul_i32 s24, s3, s12
; GFX10-NEXT: s_add_i32 s7, s7, s26
; GFX10-NEXT: s_mul_i32 s25, s2, s13
; GFX10-NEXT: s_add_i32 s7, s7, s27
; GFX10-NEXT: s_mul_i32 s26, s1, s14
; GFX10-NEXT: s_add_i32 s7, s7, s24
; GFX10-NEXT: s_mul_hi_u32 s3, s3, s11
; GFX10-NEXT: s_add_i32 s7, s7, s25
; GFX10-NEXT: s_mul_hi_u32 s2, s2, s12
; GFX10-NEXT: s_add_i32 s7, s7, s26
; GFX10-NEXT: s_mul_hi_u32 s1, s1, s13
; GFX10-NEXT: s_add_i32 s7, s7, s15
; GFX10-NEXT: s_add_i32 s6, s7, s6
; GFX10-NEXT: s_add_i32 s5, s6, s5
; GFX10-NEXT: s_mov_b32 s6, s21
; GFX10-NEXT: s_add_i32 s4, s5, s4
; GFX10-NEXT: s_mov_b32 s5, s20
; GFX10-NEXT: s_add_i32 s3, s4, s3
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s14
; GFX10-NEXT: s_add_i32 s2, s3, s2
; GFX10-NEXT: s_and_b32 s3, s22, 1
; GFX10-NEXT: s_add_i32 s1, s2, s1
; GFX10-NEXT: s_add_i32 s23, s23, s3
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: s_mul_i32 s0, s0, s8
; GFX10-NEXT: s_add_i32 s7, s1, s23
; GFX10-NEXT: s_mov_b32 s1, s16
; GFX10-NEXT: s_mov_b32 s2, s17
; GFX10-NEXT: s_mov_b32 s3, s18
; GFX10-NEXT: s_mov_b32 s4, s19
; GFX10-NEXT: ; return to shader part epilog
%result = mul i256 %num, %den
%cast = bitcast i256 %result to <8 x i32>
ret <8 x i32> %cast
}
define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7-LABEL: v_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX7-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX7-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX7-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX7-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX7-NEXT: v_add_i32_e32 v16, vcc, v16, v17
; GFX7-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v16, vcc, v16, v18
; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v17, vcc, v17, v18
; GFX7-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v18, v21
; GFX7-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v18, v21
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v20
; GFX7-NEXT: v_add_i32_e32 v17, vcc, v18, v17
; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v20, v3, v8
; GFX7-NEXT: v_mul_lo_u32 v21, v2, v9
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX7-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v2, v8
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v1, v9
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_mul_hi_u32 v22, v0, v10
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v19, v22
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v21
; GFX7-NEXT: v_add_i32_e32 v18, vcc, v19, v18
; GFX7-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v21, v4, v8
; GFX7-NEXT: v_mul_lo_u32 v22, v3, v9
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v0, v12
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v2, v9
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v0, v11
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v20, v23
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v22
; GFX7-NEXT: v_add_i32_e32 v19, vcc, v20, v19
; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX7-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX7-NEXT: v_mul_lo_u32 v23, v4, v9
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_mul_lo_u32 v21, v3, v10
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v23, v22
; GFX7-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v21, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_add_i32_e32 v20, vcc, v21, v20
; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX7-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v23
; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v24, v23
; GFX7-NEXT: v_mul_lo_u32 v24, v3, v11
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX7-NEXT: v_add_i32_e32 v22, vcc, v22, v24
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v24
; GFX7-NEXT: v_add_i32_e32 v21, vcc, v22, v21
; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX7-NEXT: v_add_i32_e32 v23, vcc, v23, v22
; GFX7-NEXT: v_mul_lo_u32 v22, v0, v8
; GFX7-NEXT: v_mul_hi_u32 v8, v6, v8
; GFX7-NEXT: v_mul_lo_u32 v6, v6, v9
; GFX7-NEXT: v_mul_hi_u32 v9, v5, v9
; GFX7-NEXT: v_mul_lo_u32 v5, v5, v10
; GFX7-NEXT: v_mul_hi_u32 v10, v4, v10
; GFX7-NEXT: v_mul_lo_u32 v4, v4, v11
; GFX7-NEXT: v_mul_hi_u32 v11, v3, v11
; GFX7-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX7-NEXT: v_mul_hi_u32 v12, v2, v12
; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX7-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; GFX7-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; GFX7-NEXT: v_mul_hi_u32 v13, v1, v13
; GFX7-NEXT: v_mul_lo_u32 v1, v1, v14
; GFX7-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GFX7-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GFX7-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v15
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v8
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v9
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v10
; GFX7-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v12
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v13
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v7, vcc, v0, v23
; GFX7-NEXT: v_mov_b32_e32 v0, v22
; GFX7-NEXT: v_mov_b32_e32 v1, v16
; GFX7-NEXT: v_mov_b32_e32 v2, v17
; GFX7-NEXT: v_mov_b32_e32 v3, v18
; GFX7-NEXT: v_mov_b32_e32 v4, v19
; GFX7-NEXT: v_mov_b32_e32 v5, v20
; GFX7-NEXT: v_mov_b32_e32 v6, v21
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX8-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX8-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX8-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX8-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v17
; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v18
; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v18
; GFX8-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v21
; GFX8-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v21
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v20
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v18, v17
; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v20, v3, v8
; GFX8-NEXT: v_mul_lo_u32 v21, v2, v9
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX8-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v2, v8
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v1, v9
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_mul_hi_u32 v22, v0, v10
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v22
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v21
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v19, v18
; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v21, v4, v8
; GFX8-NEXT: v_mul_lo_u32 v22, v3, v9
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v0, v12
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v2, v9
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v0, v11
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v23
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v22
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v20, v19
; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX8-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX8-NEXT: v_mul_lo_u32 v23, v4, v9
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_mul_lo_u32 v21, v3, v10
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v23, v22
; GFX8-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v21, v20
; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX8-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v23
; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v24, v23
; GFX8-NEXT: v_mul_lo_u32 v24, v3, v11
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v24
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v24
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v22, v21
; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v22
; GFX8-NEXT: v_mul_lo_u32 v22, v0, v8
; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8
; GFX8-NEXT: v_mul_lo_u32 v6, v6, v9
; GFX8-NEXT: v_mul_hi_u32 v9, v5, v9
; GFX8-NEXT: v_mul_lo_u32 v5, v5, v10
; GFX8-NEXT: v_mul_hi_u32 v10, v4, v10
; GFX8-NEXT: v_mul_lo_u32 v4, v4, v11
; GFX8-NEXT: v_mul_hi_u32 v11, v3, v11
; GFX8-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX8-NEXT: v_mul_hi_u32 v12, v2, v12
; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
; GFX8-NEXT: v_mul_hi_u32 v13, v1, v13
; GFX8-NEXT: v_mul_lo_u32 v1, v1, v14
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v15
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v8
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v9
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v10
; GFX8-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v11
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v12
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v13
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v23
; GFX8-NEXT: v_mov_b32_e32 v0, v22
; GFX8-NEXT: v_mov_b32_e32 v1, v16
; GFX8-NEXT: v_mov_b32_e32 v2, v17
; GFX8-NEXT: v_mov_b32_e32 v3, v18
; GFX8-NEXT: v_mov_b32_e32 v4, v19
; GFX8-NEXT: v_mov_b32_e32 v5, v20
; GFX8-NEXT: v_mov_b32_e32 v6, v21
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v16, v2, v8
; GFX9-NEXT: v_mul_lo_u32 v17, v1, v9
; GFX9-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX9-NEXT: v_mul_hi_u32 v19, v1, v8
; GFX9-NEXT: v_mul_lo_u32 v20, v1, v8
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v17
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v16, v18
; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v16, v19
; GFX9-NEXT: v_mul_lo_u32 v21, v0, v9
; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v18, v17, v18, v16
; GFX9-NEXT: v_mul_hi_u32 v16, v0, v8
; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, v20, v21
; GFX9-NEXT: v_mul_hi_u32 v21, v0, v9
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, v17, v16
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21
; GFX9-NEXT: v_add_u32_e32 v17, v20, v17
; GFX9-NEXT: v_mul_lo_u32 v21, v3, v8
; GFX9-NEXT: v_mul_lo_u32 v22, v2, v9
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, v19, v17
; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v20, v19
; GFX9-NEXT: v_mul_lo_u32 v19, v1, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v22
; GFX9-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v22
; GFX9-NEXT: v_mul_hi_u32 v23, v2, v8
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v21, v20, v22
; GFX9-NEXT: v_mul_hi_u32 v21, v1, v9
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v0, v10
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v22, v21
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23
; GFX9-NEXT: v_mul_lo_u32 v22, v4, v8
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v9
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, v19, v18
; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v19, v20, v21, v19
; GFX9-NEXT: v_mul_lo_u32 v20, v2, v10
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v1, v11
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v22, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v22, v0, v12
; GFX9-NEXT: v_mul_hi_u32 v23, v3, v8
; GFX9-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v22, v23
; GFX9-NEXT: v_mul_hi_u32 v22, v2, v9
; GFX9-NEXT: v_mul_hi_u32 v23, v1, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v22, v23
; GFX9-NEXT: v_mul_hi_u32 v22, v0, v11
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v10
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v20, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19
; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v20, v21, v22, v20
; GFX9-NEXT: v_mul_lo_u32 v21, v5, v8
; GFX9-NEXT: v_mul_lo_u32 v22, v4, v9
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v22
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v1, v12
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v4, v8
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v3, v9
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v2, v10
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v1, v11
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v24, v23
; GFX9-NEXT: v_mul_hi_u32 v23, v0, v12
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v21, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20
; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v21, v22, v23, v21
; GFX9-NEXT: v_mul_lo_u32 v22, v6, v8
; GFX9-NEXT: v_mul_lo_u32 v23, v5, v9
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v4, v10
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v3, v11
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v23
; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v24, v25, v23
; GFX9-NEXT: v_mul_lo_u32 v24, v2, v12
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_lo_u32 v24, v1, v13
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v5, v8
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v4, v9
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v3, v10
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v2, v11
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v1, v12
; GFX9-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v25, v24
; GFX9-NEXT: v_mul_hi_u32 v24, v0, v13
; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, v22, v24
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, v22, v21
; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
; GFX9-NEXT: v_add3_u32 v22, v23, v24, v22
; GFX9-NEXT: v_mul_lo_u32 v23, v6, v9
; GFX9-NEXT: v_mul_lo_u32 v24, v4, v11
; GFX9-NEXT: v_mul_hi_u32 v4, v4, v10
; GFX9-NEXT: v_mul_hi_u32 v6, v6, v8
; GFX9-NEXT: v_add_u32_e32 v7, v7, v23
; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10
; GFX9-NEXT: v_mul_hi_u32 v5, v5, v9
; GFX9-NEXT: v_mul_hi_u32 v9, v3, v11
; GFX9-NEXT: v_mul_hi_u32 v10, v2, v12
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v12
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13
; GFX9-NEXT: v_mul_hi_u32 v11, v1, v13
; GFX9-NEXT: v_mul_lo_u32 v12, v1, v14
; GFX9-NEXT: v_mul_lo_u32 v13, v0, v15
; GFX9-NEXT: v_add3_u32 v7, v7, v23, v24
; GFX9-NEXT: v_add3_u32 v2, v7, v3, v2
; GFX9-NEXT: v_mul_lo_u32 v1, v0, v8
; GFX9-NEXT: v_add3_u32 v2, v2, v12, v13
; GFX9-NEXT: v_mul_hi_u32 v0, v0, v14
; GFX9-NEXT: v_add3_u32 v2, v2, v6, v5
; GFX9-NEXT: v_add3_u32 v2, v2, v4, v9
; GFX9-NEXT: v_add3_u32 v2, v2, v10, v11
; GFX9-NEXT: v_add3_u32 v7, v2, v0, v22
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v16
; GFX9-NEXT: v_mov_b32_e32 v2, v17
; GFX9-NEXT: v_mov_b32_e32 v3, v18
; GFX9-NEXT: v_mov_b32_e32 v4, v19
; GFX9-NEXT: v_mov_b32_e32 v5, v20
; GFX9-NEXT: v_mov_b32_e32 v6, v21
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX10-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX10-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX10-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX10-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX10-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX10-NEXT: v_mul_lo_u32 v22, v3, v8
; GFX10-NEXT: v_mul_lo_u32 v25, v1, v10
; GFX10-NEXT: v_mul_hi_u32 v23, v0, v9
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v17
; GFX10-NEXT: v_mul_hi_u32 v27, v0, v10
; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v29, v3, v9
; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v18
; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v19, s4, v19, v20
; GFX10-NEXT: v_mul_lo_u32 v20, v2, v9
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX10-NEXT: v_add_nc_u32_e32 v17, v17, v18
; GFX10-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX10-NEXT: v_add_co_u32 v18, s4, v19, v18
; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v22, v20
; GFX10-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX10-NEXT: v_add_co_u32 v18, s5, v18, v21
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v25
; GFX10-NEXT: v_add_co_u32 v18, s5, v18, v23
; GFX10-NEXT: v_mul_hi_u32 v23, v1, v9
; GFX10-NEXT: v_add3_u32 v19, v24, v19, v21
; GFX10-NEXT: v_mul_hi_u32 v21, v2, v8
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v22
; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v17, s5, v18, v17
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v21
; GFX10-NEXT: v_add3_u32 v21, v26, v24, v25
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v8
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v9
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v23
; GFX10-NEXT: v_add3_u32 v18, v19, v22, v18
; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
; GFX10-NEXT: v_add3_u32 v19, v21, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v21, v2, v10
; GFX10-NEXT: v_add_co_u32 v22, s4, v25, v26
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v11
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v26, v3, v8
; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v21, s4, v22, v21
; GFX10-NEXT: v_mul_lo_u32 v22, v0, v12
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v24
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v18, s4, v20, v18
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v9
; GFX10-NEXT: v_add3_u32 v24, v25, v27, v24
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v19, v19, v23, v20
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v26
; GFX10-NEXT: v_mul_hi_u32 v20, v1, v10
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v27, v0, v11
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX10-NEXT: v_add3_u32 v23, v24, v25, v26
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v9
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v21, v20
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v24
; GFX10-NEXT: v_add3_u32 v21, v23, v25, v21
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26
; GFX10-NEXT: v_mul_lo_u32 v26, v1, v12
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v19, s5, v20, v19
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23
; GFX10-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v26
; GFX10-NEXT: v_mul_hi_u32 v26, v4, v8
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v20, v21, v25, v20
; GFX10-NEXT: v_add_co_u32 v22, s4, v22, v23
; GFX10-NEXT: v_add3_u32 v23, v24, v27, v28
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8
; GFX10-NEXT: v_mul_lo_u32 v28, v5, v9
; GFX10-NEXT: v_add_co_u32 v21, s4, v22, v26
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v10
; GFX10-NEXT: v_add3_u32 v23, v23, v30, v24
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v29
; GFX10-NEXT: v_mul_hi_u32 v29, v0, v12
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_add3_u32 v23, v23, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v10
; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v25, s4, v27, v28
; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v26
; GFX10-NEXT: v_mul_lo_u32 v27, v3, v11
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v25, v24
; GFX10-NEXT: v_add_co_u32 v21, s5, v21, v29
; GFX10-NEXT: v_add3_u32 v22, v23, v22, v26
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v12
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v24, v27
; GFX10-NEXT: v_mul_lo_u32 v27, v1, v13
; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20
; GFX10-NEXT: v_add_co_u32 v23, s4, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
; GFX10-NEXT: v_mul_hi_u32 v27, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v21, v22, v26, v21
; GFX10-NEXT: v_mul_hi_u32 v26, v2, v11
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v24
; GFX10-NEXT: v_add3_u32 v24, v28, v25, v29
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v25, v3, v10
; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
; GFX10-NEXT: v_add3_u32 v24, v24, v30, v32
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v31
; GFX10-NEXT: v_add3_u32 v22, v24, v28, v27
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v28, v6, v9
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v25
; GFX10-NEXT: v_mul_hi_u32 v27, v1, v12
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v6, v6, v8
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v26
; GFX10-NEXT: v_add3_u32 v22, v22, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v11
; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v28
; GFX10-NEXT: v_mul_lo_u32 v28, v2, v13
; GFX10-NEXT: v_mul_hi_u32 v5, v5, v9
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v27
; GFX10-NEXT: v_mul_hi_u32 v4, v4, v10
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v7, v7, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v14
; GFX10-NEXT: v_mul_hi_u32 v25, v0, v13
; GFX10-NEXT: v_mul_hi_u32 v2, v2, v12
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v13
; GFX10-NEXT: v_add3_u32 v7, v7, v29, v28
; GFX10-NEXT: v_add3_u32 v22, v22, v26, v27
; GFX10-NEXT: v_add3_u32 v7, v7, v24, v15
; GFX10-NEXT: v_add_co_u32 v9, s4, v23, v25
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v5, v7, v6, v5
; GFX10-NEXT: v_add_co_u32 v6, s4, v9, v21
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v3, v5, v4, v3
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v14
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v5, v22, v10, v7
; GFX10-NEXT: v_add3_u32 v1, v3, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v2, v17
; GFX10-NEXT: v_mov_b32_e32 v3, v18
; GFX10-NEXT: v_add3_u32 v7, v1, v4, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v16
; GFX10-NEXT: v_mov_b32_e32 v4, v19
; GFX10-NEXT: v_mov_b32_e32 v5, v20
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
}