Files
clang-p2996/llvm/test/CodeGen/AMDGPU/mul_int24.ll
Jay Foad e301e071ba [AMDGPU] Remove IR SpeculativeExecution pass from codegen pipeline
This pass seems to have very little effect because all it does is hoist
some instructions, but it is followed later in the codegen pipeline by
the IR CodeSinking pass which does the opposite.

Differential Revision: https://reviews.llvm.org/D130258
2022-08-02 17:35:20 +01:00

844 lines
31 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
; Operands are sign-extended in-register to 24 bits (shl 8 / ashr 8) and then
; multiplied in i32. The GCN targets lower this to s_bfe_i32 + s_mul_i32;
; EG/CM lower to LSHL/ASHR + MULLO_INT, per the autogenerated checks below.
define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smul24_i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s2, s2, 0x180000
; SI-NEXT: s_bfe_i32 s3, s3, 0x180000
; SI-NEXT: s_mul_i32 s2, s2, s3
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_bfe_i32 s0, s2, 0x180000
; VI-NEXT: s_bfe_i32 s1, s3, 0x180000
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i32:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[2].Z, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
entry:
; shl 8 + ashr 8 == sign-extend the low 24 bits of a 32-bit value.
%a.shl = shl i32 %a, 8
%a.24 = ashr i32 %a.shl, 8
%b.shl = shl i32 %b, 8
%b.24 = ashr i32 %b.shl, 8
%mul24 = mul i32 %a.24, %b.24
store i32 %mul24, i32 addrspace(1)* %out
ret void
}
; High half of a signed 24x24->48-bit multiply: the product is formed in i64
; and bits [32..] are stored as i32. SI/VI select v_mul_hi_i32_i24; GFX9 uses
; scalar s_mul_hi_i32; CM selects MULHI_INT24 directly (EG has no INT24 form).
define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smulhi24_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smulhi24_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smulhi24_i64:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MULHI_INT * T1.X, PS, PV.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smulhi24_i64:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MULHI_INT24 T1.X, KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
entry:
; 24-bit operands, widened to i64 so the full 48-bit product exists in IR.
%a.shl = shl i32 %a, 8
%a.24 = ashr i32 %a.shl, 8
%b.shl = shl i32 %b, 8
%b.24 = ashr i32 %b.shl, 8
%a.24.i64 = sext i32 %a.24 to i64
%b.24.i64 = sext i32 %b.24 to i64
%mul48 = mul i64 %a.24.i64, %b.24.i64
; Only the upper 32 bits of the product are kept.
%mul48.hi = lshr i64 %mul48, 32
%mul24hi = trunc i64 %mul48.hi to i32
store i32 %mul24hi, i32 addrspace(1)* %out
ret void
}
; 24-bit operands held in i64 (shl 40 / ashr 40): the GCN targets recognize
; this and emit a v_mul_i32_i24 / v_mul_hi_i32_i24 pair for the i64 result.
; EG/CM emit nothing here because non-kernel (default CC) functions are not
; supported on r600 — only CF_END/PAD appear in the checks.
define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
; SI-LABEL: test_smul48_i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mul_i32_i24_e32 v3, v0, v2
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
; SI-NEXT: v_mov_b32_e32 v0, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_smul48_i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mul_i32_i24_e32 v3, v0, v2
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_smul48_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_i32_i24_e32 v3, v0, v2
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: test_smul48_i64:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
;
; CM-LABEL: test_smul48_i64:
; CM: ; %bb.0:
; CM-NEXT: CF_END
; CM-NEXT: PAD
; shl 40 + ashr 40 == sign-extend the low 24 bits of a 64-bit value.
%shl.lhs = shl i64 %lhs, 40
%lhs24 = ashr i64 %shl.lhs, 40
%shl.rhs = shl i64 %rhs, 40
%rhs24 = ashr i64 %shl.rhs, 40
%mul = mul i64 %lhs24, %rhs24
ret i64 %mul
}
; Vector form of test_smul48_i64: <2 x i64> operands sign-extended from
; 24 bits, multiplied element-wise. Each lane lowers to a
; v_mul_i32_i24 / v_mul_hi_i32_i24 pair on the GCN targets.
define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; SI-LABEL: test_smul48_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v6
; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT: v_ashr_i64 v[5:6], v[0:1], 40
; SI-NEXT: v_ashr_i64 v[1:2], v[1:2], 40
; SI-NEXT: v_ashr_i64 v[6:7], v[2:3], 40
; SI-NEXT: v_ashr_i64 v[2:3], v[3:4], 40
; SI-NEXT: v_mul_i32_i24_e32 v0, v1, v2
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v2
; SI-NEXT: v_mul_i32_i24_e32 v2, v5, v6
; SI-NEXT: v_mul_hi_i32_i24_e32 v3, v5, v6
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: test_smul48_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: v_ashrrev_i64 v[7:8], 40, v[0:1]
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; VI-NEXT: v_ashrrev_i64 v[3:4], 40, v[2:3]
; VI-NEXT: v_ashrrev_i64 v[4:5], 40, v[1:2]
; VI-NEXT: v_mul_i32_i24_e32 v0, v1, v3
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v3
; VI-NEXT: v_mul_i32_i24_e32 v2, v7, v4
; VI-NEXT: v_mul_hi_i32_i24_e32 v3, v7, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_smul48_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; GFX9-NEXT: v_ashrrev_i64 v[7:8], 40, v[0:1]
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX9-NEXT: v_ashrrev_i64 v[1:2], 40, v[0:1]
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; GFX9-NEXT: v_ashrrev_i64 v[3:4], 40, v[2:3]
; GFX9-NEXT: v_ashrrev_i64 v[4:5], 40, v[1:2]
; GFX9-NEXT: v_mul_i32_i24_e32 v0, v1, v3
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v1, v3
; GFX9-NEXT: v_mul_i32_i24_e32 v2, v7, v4
; GFX9-NEXT: v_mul_hi_i32_i24_e32 v3, v7, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: test_smul48_v2i64:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
;
; CM-LABEL: test_smul48_v2i64:
; CM: ; %bb.0:
; CM-NEXT: CF_END
; CM-NEXT: PAD
%shl.lhs = shl <2 x i64> %lhs, <i64 40, i64 40>
%lhs24 = ashr <2 x i64> %shl.lhs, <i64 40, i64 40>
%shl.rhs = shl <2 x i64> %rhs, <i64 40, i64 40>
%rhs24 = ashr <2 x i64> %shl.rhs, <i64 40, i64 40>
%mul = mul <2 x i64> %lhs24, %rhs24
ret <2 x i64> %mul
}
; This requires handling of the original 64-bit mul node to eliminate
; unnecessary extension instructions because after legalization they
; will not be removed by SimplifyDemandedBits because there are
; multiple uses by the separate mul and mulhi.
; Full 48-bit product stored as i64: both the low (mul) and high (mulhi)
; halves are needed, so the sign-extension of the operands has two uses.
; The [8 x i32] padding arguments push %a and %b to distinct kernarg
; offsets (0x13/0x1c on SI, 0x4c/0x70 on VI/GFX9 in the checks below).
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_smul24_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0x13
; SI-NEXT: s_load_dword s0, s[0:1], 0x1c
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s1, s2, 0x180000
; SI-NEXT: s_bfe_i32 s0, s0, 0x180000
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mul_i32 s1, s0, s1
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
; VI-NEXT: s_load_dword s5, s[0:1], 0x70
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
; VI-NEXT: s_bfe_i32 s5, s5, 0x180000
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0
; VI-NEXT: v_mul_i32_i24_e32 v0, s5, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s2, s1, s0
; GFX9-NEXT: s_mul_i32 s1, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[4].Z, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[6].W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PS,
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T0.X, T1.W, T0.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[4].Z, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[6].W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR T1.Z, PV.W, literal.y,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T1.Z, T0.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T1.Z, T0.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T1.Z, T0.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T1.Z, T0.W,
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 T1.Y, KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[6].W, KC0[4].Z,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[6].W, KC0[4].Z,
%shl.i = shl i32 %a, 8
%shr.i = ashr i32 %shl.i, 8
%conv.i = sext i32 %shr.i to i64
%shl1.i = shl i32 %b, 8
%shr2.i = ashr i32 %shl1.i, 8
%conv3.i = sext i32 %shr2.i to i64
%mul.i = mul i64 %conv3.i, %conv.i
store i64 %mul.i, i64 addrspace(1)* %out
ret void
}
; Squaring variant: the same sign-extended 24-bit value is used as both
; multiplicands (%b is a declared but unused parameter). Exercises mul/mulhi
; selection when both operands are the identical SSA value.
define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smul24_i64_square:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s4, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s4, s4, 0x180000
; SI-NEXT: s_mul_i32 s5, s4, s4
; SI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i64_square:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
; VI-NEXT: v_mul_i32_i24_e64 v0, s4, s4
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64_square:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT: s_mul_hi_i32 s1, s0, s0
; GFX9-NEXT: s_mul_i32 s0, s0, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i64_square:
; EG: ; %bb.0:
; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.Y, PV.W, PV.W,
; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT: MULLO_INT * T0.X, T0.W, T0.W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i64_square:
; CM: ; %bb.0:
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL * T0.W, KC0[2].Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T0.W, PV.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: MULLO_INT T1.X, T0.W, T0.W,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T0.W,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T0.W,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T0.W,
; CM-NEXT: MULHI_INT24 T1.X (MASKED), KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 T1.Y, KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].Z,
; CM-NEXT: MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].Z,
%shl.i = shl i32 %a, 8
%shr.i = ashr i32 %shl.i, 8
%conv.i = sext i32 %shr.i to i64
%mul.i = mul i64 %conv.i, %conv.i
store i64 %mul.i, i64 addrspace(1)* %out
ret void
}
; Non-power-of-two integer type: i33 operands sign-extended in-register from
; 24 bits (shl 9 / ashr 9), multiplied in i33, then sext'd to i64 for the
; store. Legalization of the odd-width mul must still produce the 24-bit
; mul/mulhi pair plus the final 33->64 sign extension (lshl/ashr by 31 on
; GCN, BFE_INT of the high bit on EG/CM).
define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
; SI-LABEL: test_smul24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_load_dword s0, s[0:1], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s1, s2, 8
; SI-NEXT: s_lshl_b32 s3, s0, 8
; SI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
; SI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_mul_i32 s1, s0, s2
; SI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 31
; SI-NEXT: v_ashr_i64 v[0:1], v[0:1], 31
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smul24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s3, s2, 8
; VI-NEXT: s_lshl_b32 s5, s4, 8
; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40
; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s2, 8
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; GFX9-NEXT: s_lshl_b32 s1, s3, 8
; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
; GFX9-NEXT: s_mul_hi_i32 s1, s0, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s2
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 31
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 31
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smul24_i33:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
; EG-NEXT: MULLO_INT * T1.X, T0.W, T1.W,
; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i33:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 16, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: LSHL T0.Z, KC0[2].W, literal.x,
; CM-NEXT: LSHL * T0.W, KC0[3].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULLO_INT T1.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
; CM-NEXT: LSHR T2.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T1.Y, T0.X, 0.0, 1,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
; shl 9 + ashr 9 == sign-extend the low 24 bits of a 33-bit value.
%a.shl = shl i33 %a, 9
%a.24 = ashr i33 %a.shl, 9
%b.shl = shl i33 %b, 9
%b.24 = ashr i33 %b.shl, 9
%mul24 = mul i33 %a.24, %b.24
%ext = sext i33 %mul24 to i64
store i64 %ext, i64 addrspace(1)* %out
ret void
}
; High-part extraction from an i33 24x24 multiply: lshr 32 of the i33 product
; leaves only bit 32, so the stored i32 is the mulhi result masked to 1 bit
; (v_mul_hi_i32_i24 + v_and 1 on SI/VI; MULHI_INT(24) + AND_INT 1 on EG/CM).
define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
; SI-LABEL: test_smulhi24_i33:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s4, s[0:1], 0xd
; SI-NEXT: s_load_dword s5, s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_smulhi24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s4, s[0:1], 0x34
; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i33:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s2, 8
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
; GFX9-NEXT: s_lshl_b32 s1, s3, 8
; GFX9-NEXT: s_ashr_i64 s[2:3], s[0:1], 40
; GFX9-NEXT: s_mul_hi_i32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: test_smulhi24_i33:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT: LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MULHI_INT * T0.X, PS, PV.W,
; EG-NEXT: AND_INT T0.X, PS, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smulhi24_i33:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 6, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT: AND_INT * T0.X, PV.X, 1,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%tmp0 = shl i33 %a, 9
%a_24 = ashr i33 %tmp0, 9
%tmp1 = shl i33 %b, 9
%b_24 = ashr i33 %tmp1, 9
%tmp2 = mul i33 %a_24, %b_24
; Bits [32..32] of the i33 product: a single bit after the i33 lshr.
%hi = lshr i33 %tmp2, 32
%trunc = trunc i33 %hi to i32
store i32 %trunc, i32 addrspace(1)* %out
ret void
}
; Regression test: the 24-bit mul pattern applied to splat shufflevector
; operands behind a branch, a combination that previously crashed i24
; simplification. The splat makes both product lanes equal, so the GCN
; targets emit one s_mul_i32 and store the same value to both dwords.
define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
; SI-LABEL: simplify_i24_crash:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: s_cbranch_scc0 .LBB8_2
; SI-NEXT: ; %bb.1: ; %bb7
; SI-NEXT: s_endpgm
; SI-NEXT: .LBB8_2: ; %bb11
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i32 s2, s4, 0x180000
; SI-NEXT: s_bfe_i32 s4, s6, 0x180000
; SI-NEXT: s_mul_i32 s4, s2, s4
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: simplify_i24_crash:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB8_2
; VI-NEXT: ; %bb.1: ; %bb7
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB8_2: ; %bb11
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
; VI-NEXT: s_bfe_i32 s5, s6, 0x180000
; VI-NEXT: s_mul_i32 s4, s4, s5
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: simplify_i24_crash:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc0 .LBB8_2
; GFX9-NEXT: ; %bb.1: ; %bb7
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB8_2: ; %bb11
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s0, s4, 0x180000
; GFX9-NEXT: s_bfe_i32 s1, s6, 0x180000
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: simplify_i24_crash:
; EG: ; %bb.0: ; %bb
; EG-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: JUMP @5 POP:1
; EG-NEXT: ALU 14, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
; EG-NEXT: POP @5 POP:1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV T0.X, KC0[3].Y,
; EG-NEXT: MOV * T1.X, KC0[2].W,
; EG-NEXT: LSHL T0.W, PS, literal.x,
; EG-NEXT: LSHL * T1.W, PV.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: MOV T2.W, KC0[2].Y,
; EG-NEXT: MULLO_INT * T0.X, PS, PV.W,
; EG-NEXT: LSHR T1.X, PV.W, literal.x,
; EG-NEXT: MOV T0.Y, PS,
; EG-NEXT: MOV T0.W, KC0[3].X,
; EG-NEXT: MOV * T0.W, KC0[3].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: simplify_i24_crash:
; CM: ; %bb.0: ; %bb
; CM-NEXT: ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT: JUMP @5 POP:1
; CM-NEXT: ALU 17, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: POP @5 POP:1
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: SETNE_INT * T0.W, KC0[2].Z, 0.0,
; CM-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[3].Y,
; CM-NEXT: MOV * T1.X, KC0[2].W,
; CM-NEXT: LSHL T0.Z, PV.X, literal.x,
; CM-NEXT: LSHL * T0.W, T0.X, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MOV T0.Y, KC0[2].Y,
; CM-NEXT: ASHR T1.Z, PV.W, literal.x,
; CM-NEXT: ASHR * T0.W, PV.Z, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: MULLO_INT T0.X, T0.W, T1.Z,
; CM-NEXT: MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
; CM-NEXT: MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
; CM-NEXT: LSHR T1.X, T0.Y, literal.x,
; CM-NEXT: MOV T0.Y, PV.X,
; CM-NEXT: MOV T0.Z, KC0[3].X,
; CM-NEXT: MOV * T0.W, KC0[3].Z,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
bb:
%cmp = icmp eq i32 %arg0, 0
br i1 %cmp, label %bb11, label %bb7
; Taken when %arg0 == 0: splat lane 0 of each vector, do the 24-bit
; vector multiply, store the result.
bb11:
%tmp14 = shufflevector <2 x i32> %arg1, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp16 = shufflevector <2 x i32> %arg2, <2 x i32> undef, <2 x i32> zeroinitializer
%tmp17 = shl <2 x i32> %tmp14, <i32 8, i32 8>
%tmp18 = ashr <2 x i32> %tmp17, <i32 8, i32 8>
%tmp19 = shl <2 x i32> %tmp16, <i32 8, i32 8>
%tmp20 = ashr <2 x i32> %tmp19, <i32 8, i32 8>
%tmp21 = mul <2 x i32> %tmp18, %tmp20
store <2 x i32> %tmp21, <2 x i32> addrspace(1)* %out
br label %bb7
bb7:
ret void
}
; Attribute group referenced as #0 by most kernels above.
attributes #0 = { nounwind }