diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 701a76c4cc6b..4f91f90b0469 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18403,49 +18403,12 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, DL, VT, {N0, N1}))
     return C;

-  if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
-    const APFloat &V = N1C->getValueAPF();
-    // copysign(x, c1) -> fabs(x) iff ispos(c1)
-    // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
-    if (!V.isNegative()) {
-      if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
-        return DAG.getNode(ISD::FABS, DL, VT, N0);
-    } else {
-      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
-        return DAG.getNode(ISD::FNEG, DL, VT,
-                           DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
-    }
-  }
-
-  // copysign(fabs(x), y) -> copysign(x, y)
-  // copysign(fneg(x), y) -> copysign(x, y)
-  // copysign(copysign(x,z), y) -> copysign(x, y)
-  if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
-      N0.getOpcode() == ISD::FCOPYSIGN)
-    return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0.getOperand(0), N1);
-
-  // copysign(x, abs(y)) -> abs(x)
-  if (N1.getOpcode() == ISD::FABS)
-    return DAG.getNode(ISD::FABS, DL, VT, N0);
-
-  // copysign(x, copysign(y,z)) -> copysign(x, z)
-  if (N1.getOpcode() == ISD::FCOPYSIGN)
-    return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(1));
-
   // copysign(x, fp_extend(y)) -> copysign(x, y)
   // copysign(x, fp_round(y)) -> copysign(x, y)
   if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
     return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(0));

-  // We only take the sign bit from the sign operand.
-  EVT SignVT = N1.getValueType();
-  if (SimplifyDemandedBits(N1,
-                           APInt::getSignMask(SignVT.getScalarSizeInBits())))
-    return SDValue(N, 0);
-
-  // We only take the non-sign bits from the value operand
-  if (SimplifyDemandedBits(N0,
-                           APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
+  if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);

   return SDValue();
@@ -18972,6 +18935,9 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
                        N0.getOperand(0));
   }

+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
   if (SDValue Cast = foldSignChangeInBitcast(N))
     return Cast;

@@ -19045,14 +19011,8 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::FABS, DL, VT, {N0}))
     return C;

-  // fold (fabs (fabs x)) -> (fabs x)
-  if (N0.getOpcode() == ISD::FABS)
-    return N->getOperand(0);
-
-  // fold (fabs (fneg x)) -> (fabs x)
-  // fold (fabs (fcopysign x, y)) -> (fabs x)
-  if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
-    return DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);

   if (SDValue Cast = foldSignChangeInBitcast(N))
     return Cast;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 1e470318ced0..66717135c9ad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2967,6 +2967,77 @@ bool TargetLowering::SimplifyDemandedBits(
     }
     break;
   }
+  case ISD::FABS: {
+    SDValue Op0 = Op.getOperand(0);
+    APInt SignMask = APInt::getSignMask(BitWidth);
+
+    if (!DemandedBits.intersects(SignMask))
+      return TLO.CombineTo(Op, Op0);
+
+    if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known, TLO,
+                             Depth + 1))
+      return true;
+
+    if (Known.isNonNegative())
+      return TLO.CombineTo(Op, Op0);
+    if (Known.isNegative())
+      return TLO.CombineTo(
+          Op, TLO.DAG.getNode(ISD::FNEG, dl, VT, Op0, Op->getFlags()));
+
+    Known.Zero |= SignMask;
+    Known.One &= ~SignMask;
+
+    break;
+  }
+  case ISD::FCOPYSIGN: {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    unsigned BitWidth0 = Op0.getScalarValueSizeInBits();
+    unsigned BitWidth1 = Op1.getScalarValueSizeInBits();
+    APInt SignMask0 = APInt::getSignMask(BitWidth0);
+    APInt SignMask1 = APInt::getSignMask(BitWidth1);
+
+    if (!DemandedBits.intersects(SignMask0))
+      return TLO.CombineTo(Op, Op0);
+
+    if (SimplifyDemandedBits(Op0, ~SignMask0 & DemandedBits, DemandedElts,
+                             Known, TLO, Depth + 1) ||
+        SimplifyDemandedBits(Op1, SignMask1, DemandedElts, Known2, TLO,
+                             Depth + 1))
+      return true;
+
+    if (Known2.isNonNegative())
+      return TLO.CombineTo(
+          Op, TLO.DAG.getNode(ISD::FABS, dl, VT, Op0, Op->getFlags()));
+
+    if (Known2.isNegative())
+      return TLO.CombineTo(
+          Op, TLO.DAG.getNode(ISD::FNEG, dl, VT,
+                              TLO.DAG.getNode(ISD::FABS, SDLoc(Op0), VT, Op0)));
+
+    Known.Zero &= ~SignMask0;
+    Known.One &= ~SignMask0;
+    break;
+  }
+  case ISD::FNEG: {
+    SDValue Op0 = Op.getOperand(0);
+    APInt SignMask = APInt::getSignMask(BitWidth);
+
+    if (!DemandedBits.intersects(SignMask))
+      return TLO.CombineTo(Op, Op0);
+
+    if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known, TLO,
+                             Depth + 1))
+      return true;
+
+    if (!Known.isSignUnknown()) {
+      Known.Zero ^= SignMask;
+      Known.One ^= SignMask;
+    }
+
+    break;
+  }
   default:
     // We also ask the target about intrinsics (which could be specific to it).
     if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
index 5e5fdd6d3170..0189f52bbac0 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -391,13 +391,10 @@ define float @extract_v4i32_copysign_build_vector(<4 x float> %a, <4 x float> %b
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    sub sp, sp, #16
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT:    adrp x8, .LCPI16_0
-; CHECK-SD-NEXT:    mvni v1.4s, #128, lsl #24
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI16_0]
+; CHECK-SD-NEXT:    fabs v0.4s, v0.4s
 ; CHECK-SD-NEXT:    mov x8, sp
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    bfi x8, x0, #2, #2
-; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
 ; CHECK-SD-NEXT:    str q0, [sp]
 ; CHECK-SD-NEXT:    ldr s0, [x8]
 ; CHECK-SD-NEXT:    add sp, sp, #16
@@ -425,10 +422,7 @@ entry:
 define float @extract_v4i32_copysign_build_vector_const(<4 x float> %a, <4 x float> %b, i32 %c) {
 ; CHECK-SD-LABEL: extract_v4i32_copysign_build_vector_const:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    adrp x8, .LCPI17_0
-; CHECK-SD-NEXT:    mvni v1.4s, #128, lsl #24
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI17_0]
-; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT:    fabs v0.4s, v0.4s
 ; CHECK-SD-NEXT:    mov s0, v0.s[2]
 ; CHECK-SD-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 7c89a41d62fb..f901626e54a6 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -4388,12 +4388,11 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> %m
 ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GFX8-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v1
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GFX8-NEXT:    v_bfe_u32 v3, v2, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x7fff, v3
@@ -5267,13 +5266,12 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i
 ;
 ; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x10010
-; GFX8-NEXT:    s_add_i32 s4, s4, s1
-; GFX8-NEXT:    s_or_b32 s3, s1, 0x400000
-; GFX8-NEXT:    s_add_i32 s6, s4, 0x7fff
+; GFX8-NEXT:    s_bfe_u32 s3, s1, 0x10010
+; GFX8-NEXT:    s_add_i32 s3, s3, s1
+; GFX8-NEXT:    s_addk_i32 s3, 0x7fff
 ; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], s1, s1
 ; GFX8-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; GFX8-NEXT:    s_cselect_b32 s1, s3, s6
+; GFX8-NEXT:    s_cselect_b32 s1, s1, s3
 ; GFX8-NEXT:    s_bfe_u32 s3, s2, 0x10010
 ; GFX8-NEXT:    s_add_i32 s3, s3, s2
 ; GFX8-NEXT:    s_addk_i32 s3, 0x7fff
@@ -6340,18 +6338,16 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m
 ; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT:    v_or_b32_e32 v5, 0x400000, v2
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc
 ; GFX8-NEXT:    v_bfe_u32 v5, v4, 16, 1
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX8-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v2
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v3
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x7fff, v5
@@ -7687,24 +7683,22 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32(<4 x bfloat> %m
 ; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_bfe_u32 v7, v4, 16, 1
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v4
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v4
+; GFX8-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v4
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT:    v_or_b32_e32 v6, 0x400000, v2
-; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX8-NEXT:    v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v5
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX8-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX8-NEXT:    v_bfe_u32 v6, v3, 16, 1
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x7fff, v6
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index e74d5ba24079..a3ec35da29f6 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -3227,40 +3227,38 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_and_b32_e32 v3, 0xffe, v3
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT:    v_bfe_u32 v5, v1, 20, 11
+; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v3, v0
-; VI-NEXT:    v_sub_u32_e32 v8, vcc, s4, v5
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, s4, v1
 ; VI-NEXT:    v_or_b32_e32 v3, 0x1000, v0
-; VI-NEXT:    v_med3_i32 v8, v8, 0, 13
-; VI-NEXT:    v_lshrrev_b32_e32 v9, v8, v3
-; VI-NEXT:    v_lshlrev_b32_e32 v8, v8, v9
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v8, v3
+; VI-NEXT:    v_med3_i32 v5, v5, 0, 13
+; VI-NEXT:    v_lshrrev_b32_e32 v8, v5, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v5, v5, v8
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v3
 ; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v5, vcc, s5, v5
-; VI-NEXT:    v_lshlrev_b32_e32 v8, 12, v5
-; VI-NEXT:    v_or_b32_e32 v3, v9, v3
-; VI-NEXT:    v_or_b32_e32 v8, v0, v8
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
-; VI-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
-; VI-NEXT:    v_and_b32_e32 v8, 7, v3
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v8
-; VI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 12, v1
+; VI-NEXT:    v_or_b32_e32 v3, v8, v3
+; VI-NEXT:    v_or_b32_e32 v5, v0, v5
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
+; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; VI-NEXT:    v_and_b32_e32 v5, 7, v3
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
 ; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v8, v8, v9
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v5, v5, v8
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
-; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v8
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
+; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 0x8000
-; VI-NEXT:    v_and_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; VI-NEXT:    s_mov_b32 s4, 0x7fff7fff
 ; VI-NEXT:    v_bfi_b32 v0, s4, v0, v4
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -4050,41 +4048,38 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; VI-NEXT:    s_bfe_u32 s1, s1, 0xb0014
 ; VI-NEXT:    v_readfirstlane_b32 s0, v0
-; VI-NEXT:    s_bfe_u32 s3, s1, 0xb0014
+; VI-NEXT:    s_sub_i32 s3, 0x3f1, s1
 ; VI-NEXT:    s_or_b32 s0, s7, s0
-; VI-NEXT:    s_sub_i32 s7, 0x3f1, s3
-; VI-NEXT:    v_med3_i32 v0, s7, 0, 13
+; VI-NEXT:    v_med3_i32 v0, s3, 0, 13
 ; VI-NEXT:    s_or_b32 s2, s0, 0x1000
-; VI-NEXT:    v_readfirstlane_b32 s7, v0
-; VI-NEXT:    s_lshr_b32 s8, s2, s7
-; VI-NEXT:    s_lshl_b32 s7, s8, s7
-; VI-NEXT:    s_cmp_lg_u32 s7, s2
+; VI-NEXT:    v_readfirstlane_b32 s3, v0
+; VI-NEXT:    s_lshr_b32 s7, s2, s3
+; VI-NEXT:    s_lshl_b32 s3, s7, s3
+; VI-NEXT:    s_cmp_lg_u32 s3, s2
 ; VI-NEXT:    s_cselect_b32 s2, 1, 0
-; VI-NEXT:    s_addk_i32 s3, 0xfc10
-; VI-NEXT:    s_lshl_b32 s7, s3, 12
-; VI-NEXT:    s_or_b32 s2, s8, s2
-; VI-NEXT:    s_or_b32 s7, s0, s7
-; VI-NEXT:    s_cmp_lt_i32 s3, 1
-; VI-NEXT:    s_cselect_b32 s2, s2, s7
-; VI-NEXT:    s_and_b32 s7, s2, 7
-; VI-NEXT:    s_cmp_gt_i32 s7, 5
-; VI-NEXT:    s_cselect_b32 s8, 1, 0
-; VI-NEXT:    s_cmp_eq_u32 s7, 3
+; VI-NEXT:    s_addk_i32 s1, 0xfc10
+; VI-NEXT:    s_lshl_b32 s3, s1, 12
+; VI-NEXT:    s_or_b32 s2, s7, s2
+; VI-NEXT:    s_or_b32 s3, s0, s3
+; VI-NEXT:    s_cmp_lt_i32 s1, 1
+; VI-NEXT:    s_cselect_b32 s2, s2, s3
+; VI-NEXT:    s_and_b32 s3, s2, 7
+; VI-NEXT:    s_cmp_gt_i32 s3, 5
 ; VI-NEXT:    s_cselect_b32 s7, 1, 0
-; VI-NEXT:    s_or_b32 s7, s7, s8
+; VI-NEXT:    s_cmp_eq_u32 s3, 3
+; VI-NEXT:    s_cselect_b32 s3, 1, 0
+; VI-NEXT:    s_or_b32 s3, s3, s7
 ; VI-NEXT:    s_lshr_b32 s2, s2, 2
-; VI-NEXT:    s_add_i32 s2, s2, s7
-; VI-NEXT:    s_cmp_lt_i32 s3, 31
+; VI-NEXT:    s_add_i32 s2, s2, s3
+; VI-NEXT:    s_cmp_lt_i32 s1, 31
 ; VI-NEXT:    s_cselect_b32 s2, s2, 0x7c00
 ; VI-NEXT:    s_cmp_lg_u32 s0, 0
 ; VI-NEXT:    s_cselect_b32 s0, s6, 0x7c00
-; VI-NEXT:    s_cmpk_eq_i32 s3, 0x40f
+; VI-NEXT:    s_cmpk_eq_i32 s1, 0x40f
 ; VI-NEXT:    s_cselect_b32 s0, s0, s2
-; VI-NEXT:    s_lshr_b32 s1, s1, 16
-; VI-NEXT:    s_and_b32 s1, s1, 0x8000
-; VI-NEXT:    s_or_b32 s0, s1, s0
-; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_and_b32 s0, s0, 0x7fff
 ; VI-NEXT:    s_or_b32 s0, s0, s5
 ; VI-NEXT:    s_mov_b32 s1, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -4918,40 +4913,37 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_and_b32_e32 v5, 0xffe, v5
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT:    v_bfe_u32 v8, v1, 20, 11
+; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v5, v0
-; VI-NEXT:    v_sub_u32_e32 v11, vcc, s4, v8
+; VI-NEXT:    v_sub_u32_e32 v8, vcc, s4, v1
 ; VI-NEXT:    v_or_b32_e32 v5, 0x1000, v0
-; VI-NEXT:    v_med3_i32 v11, v11, 0, 13
-; VI-NEXT:    v_lshrrev_b32_e32 v12, v11, v5
-; VI-NEXT:    v_lshlrev_b32_e32 v11, v11, v12
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v5
+; VI-NEXT:    v_med3_i32 v8, v8, 0, 13
+; VI-NEXT:    v_lshrrev_b32_e32 v11, v8, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v8, v8, v11
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v8, v5
 ; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v8, vcc, s5, v8
-; VI-NEXT:    v_lshlrev_b32_e32 v11, 12, v8
-; VI-NEXT:    v_or_b32_e32 v5, v12, v5
-; VI-NEXT:    v_or_b32_e32 v11, v0, v11
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v8
-; VI-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
-; VI-NEXT:    v_and_b32_e32 v11, 7, v5
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
-; VI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v8, 12, v1
+; VI-NEXT:    v_or_b32_e32 v5, v11, v5
+; VI-NEXT:    v_or_b32_e32 v8, v0, v8
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
+; VI-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; VI-NEXT:    v_and_b32_e32 v8, 7, v5
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v8
 ; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v11, v11, v12
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
+; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v8, v8, v11
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
-; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v11
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v8
+; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v8
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v5, 0x8000
-; VI-NEXT:    v_and_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_and_b32_e32 v5, 0x1ff, v3
 ; VI-NEXT:    v_or_b32_e32 v2, v5, v2
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; VI-NEXT:    v_and_b32_e32 v1, 0xffe, v1
@@ -4986,7 +4978,8 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_mov_b32 s4, 0x7fff7fff
 ; VI-NEXT:    v_bfi_b32 v0, s4, v0, v6
 ; VI-NEXT:    v_bfi_b32 v1, s4, v4, v7
@@ -6061,76 +6054,73 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; VI-NEXT:    v_and_b32_e32 v10, 0xffe, v10
 ; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-NEXT:    v_bfe_u32 v11, v5, 20, 11
+; VI-NEXT:    v_bfe_u32 v5, v5, 20, 11
 ; VI-NEXT:    s_movk_i32 s4, 0x3f1
 ; VI-NEXT:    v_or_b32_e32 v4, v10, v4
-; VI-NEXT:    v_sub_u32_e32 v12, vcc, s4, v11
+; VI-NEXT:    v_sub_u32_e32 v11, vcc, s4, v5
 ; VI-NEXT:    v_or_b32_e32 v10, 0x1000, v4
-; VI-NEXT:    v_med3_i32 v12, v12, 0, 13
-; VI-NEXT:    v_lshrrev_b32_e32 v13, v12, v10
-; VI-NEXT:    v_lshlrev_b32_e32 v12, v12, v13
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v12, v10
+; VI-NEXT:    v_med3_i32 v11, v11, 0, 13
+; VI-NEXT:    v_lshrrev_b32_e32 v12, v11, v10
+; VI-NEXT:    v_lshlrev_b32_e32 v11, v11, v12
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v10
 ; VI-NEXT:    s_movk_i32 s5, 0xfc10
 ; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v11, vcc, s5, v11
-; VI-NEXT:    v_lshlrev_b32_e32 v12, 12, v11
-; VI-NEXT:    v_or_b32_e32 v10, v13, v10
-; VI-NEXT:    v_or_b32_e32 v12, v4, v12
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v11
-; VI-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; VI-NEXT:    v_and_b32_e32 v12, 7, v10
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v12
-; VI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
+; VI-NEXT:    v_add_u32_e32 v5, vcc, s5, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v11, 12, v5
+; VI-NEXT:    v_or_b32_e32 v10, v12, v10
+; VI-NEXT:    v_or_b32_e32 v11, v4, v11
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
+; VI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; VI-NEXT:    v_and_b32_e32 v11, 7, v10
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
 ; VI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v12, v12, v13
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
+; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v11, v11, v12
 ; VI-NEXT:    v_lshrrev_b32_e32 v10, 2, v10
-; VI-NEXT:    v_add_u32_e32 v10, vcc, v10, v12
-; VI-NEXT:    v_mov_b32_e32 v12, 0x7c00
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v11
-; VI-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
-; VI-NEXT:    v_mov_b32_e32 v13, 0x7e00
+; VI-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
+; VI-NEXT:    v_mov_b32_e32 v11, 0x7c00
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
+; VI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; VI-NEXT:    v_mov_b32_e32 v12, 0x7e00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; VI-NEXT:    s_movk_i32 s6, 0x40f
-; VI-NEXT:    v_cndmask_b32_e32 v4, v12, v13, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v11
+; VI-NEXT:    v_cndmask_b32_e32 v4, v11, v12, vcc
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v10, 0x8000
-; VI-NEXT:    v_and_b32_e32 v11, 0x1ff, v7
-; VI-NEXT:    v_and_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v6, v11, v6
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
+; VI-NEXT:    v_and_b32_e32 v10, 0x1ff, v7
+; VI-NEXT:    v_or_b32_e32 v6, v10, v6
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v7
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; VI-NEXT:    v_and_b32_e32 v5, 0xffe, v5
 ; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v7, v7, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v5, v5, v6
-; VI-NEXT:    v_sub_u32_e32 v11, vcc, s4, v7
+; VI-NEXT:    v_sub_u32_e32 v10, vcc, s4, v7
 ; VI-NEXT:    v_or_b32_e32 v6, 0x1000, v5
-; VI-NEXT:    v_med3_i32 v11, v11, 0, 13
-; VI-NEXT:    v_lshrrev_b32_e32 v14, v11, v6
-; VI-NEXT:    v_lshlrev_b32_e32 v11, v11, v14
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v6
+; VI-NEXT:    v_med3_i32 v10, v10, 0, 13
+; VI-NEXT:    v_lshrrev_b32_e32 v13, v10, v6
+; VI-NEXT:    v_lshlrev_b32_e32 v10, v10, v13
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v10, v6
 ; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; VI-NEXT:    v_add_u32_e32 v7, vcc, s5, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v11, 12, v7
-; VI-NEXT:    v_or_b32_e32 v6, v14, v6
-; VI-NEXT:    v_or_b32_e32 v11, v5, v11
+; VI-NEXT:    v_lshlrev_b32_e32 v10, 12, v7
+; VI-NEXT:    v_or_b32_e32 v6, v13, v6
+; VI-NEXT:    v_or_b32_e32 v10, v5, v10
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
-; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
-; VI-NEXT:    v_and_b32_e32 v11, 7, v6
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
-; VI-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v11, v11, v14
+; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
+; VI-NEXT:    v_and_b32_e32 v10, 7, v6
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v10
+; VI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v10
+; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v10, v10, v13
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
-; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
+; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v10
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v7
-; VI-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; VI-NEXT:    v_cndmask_b32_e32 v5, v12, v13, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
 ; VI-NEXT:    v_and_b32_e32 v7, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v7, v0
@@ -6139,39 +6129,37 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_and_b32_e32 v6, 0xffe, v6
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; VI-NEXT:    v_bfe_u32 v7, v1, 20, 11
+; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v6, v0
-; VI-NEXT:    v_sub_u32_e32 v11, vcc, s4, v7
+; VI-NEXT:    v_sub_u32_e32 v7, vcc, s4, v1
 ; VI-NEXT:    v_or_b32_e32 v6, 0x1000, v0
-; VI-NEXT:    v_med3_i32 v11, v11, 0, 13
-; VI-NEXT:    v_lshrrev_b32_e32 v14, v11, v6
-; VI-NEXT:    v_lshlrev_b32_e32 v11, v11, v14
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v6
+; VI-NEXT:    v_med3_i32 v7, v7, 0, 13
+; VI-NEXT:    v_lshrrev_b32_e32 v10, v7, v6
+; VI-NEXT:    v_lshlrev_b32_e32 v7, v7, v10
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v6
 ; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v7, vcc, s5, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v11, 12, v7
-; VI-NEXT:    v_or_b32_e32 v6, v14, v6
-; VI-NEXT:    v_or_b32_e32 v11, v0, v11
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
-; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
-; VI-NEXT:    v_and_b32_e32 v11, 7, v6
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
-; VI-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v11, v11, v14
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v7, 12, v1
+; VI-NEXT:    v_or_b32_e32 v6, v10, v6
+; VI-NEXT:    v_or_b32_e32 v7, v0, v7
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
+; VI-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; VI-NEXT:    v_and_b32_e32 v7, 7, v6
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
+; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
+; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v7, v7, v10
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
-; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
-; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v7
-; VI-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
+; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
+; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
+; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
+; VI-NEXT:    v_cndmask_b32_e32 v0, v11, v12, vcc
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
 ; VI-NEXT:    v_and_b32_e32 v6, 0x1ff, v3
-; VI-NEXT:    v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v6, v2
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; VI-NEXT:    v_and_b32_e32 v1, 0xffe, v1
@@ -6200,16 +6188,18 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v3
-; VI-NEXT:    v_cndmask_b32_e32 v2, v12, v2, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v12, v13, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v5
+; VI-NEXT:    v_and_b32_e32 v2, 0x7fff, v4
 ; VI-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    v_bfi_b32 v0, s4, v0, v8
 ; VI-NEXT:    v_bfi_b32 v1, s4, v1, v9
 ; VI-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
index 030c33285012..0684c3081983 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
@@ -452,7 +452,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_add_f32_e64 v0, |v0|, v1
+; CI-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_negk_negk_f16:
@@ -462,7 +462,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; VI-NEXT:    v_mov_b32_e32 v3, 0xc000
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT:    v_add_f16_e64 v0, |v0|, v1
+; VI-NEXT:    v_sub_f16_e32 v0, v1, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_negk_f16:
@@ -472,7 +472,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_add_f16_e64 v0.l, |v0.l|, v1.l
+; GFX11-SAFE-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v1.l, v0.l
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negk_negk_f16:
@@ -482,7 +482,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-SAFE-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
-; GFX11-SAFE-FAKE16-NEXT:    v_add_f16_e64 v0, |v0|, v1
+; GFX11-SAFE-FAKE16-NEXT:    v_sub_f16_e32 v0, v1, v0
 ; GFX11-SAFE-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_negk_f16:
@@ -492,7 +492,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_add_f16_e64 v0.l, |v0.l|, v1.l
+; GFX11-NSZ-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v1.l, v0.l
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negk_negk_f16:
@@ -502,7 +502,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-NSZ-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
-; GFX11-NSZ-FAKE16-NEXT:    v_add_f16_e64 v0, |v0|, v1
+; GFX11-NSZ-FAKE16-NEXT:    v_sub_f16_e32 v0, v1, v0
 ; GFX11-NSZ-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, half -2.0, half -1.0
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
index a680ba593341..ec0455ab6e93 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -132,12 +132,11 @@ define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
   ret void
 }

-; FIXME: fabs should fold away
 ; GCN-LABEL: {{^}}add_select_fabs_negk_negk_f32:
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s

-; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]]
+; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
 define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
   %x = load volatile float, ptr addrspace(1) poison
   %cmp = icmp eq i32 %c, 0
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index 7ed27f008083..4e07c724b8a8 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -776,20 +776,16 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; CI-LABEL: add_select_fabs_negk_negk_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
-; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; CI-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
-; CI-NEXT:    v_add_f32_e32 v0, v0, v2
-; CI-NEXT:    v_add_f32_e32 v1, v1, v3
+; CI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT:    v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
+; CI-NEXT:    v_sub_f32_e32 v1, v3, v1
+; CI-NEXT:    v_sub_f32_e32 v0, v2, v0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: add_select_fabs_negk_negk_v2f16:
@@ -801,8 +797,8 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT:    v_add_f16_sdwa v1, |v1|, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT:    v_add_f16_e64 v0, |v0|, v2
+; VI-NEXT:    v_sub_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_sub_f16_e32 v0, v2, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -816,8 +812,7 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_negk_v2f16:
@@ -831,9 +826,7 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negk_negk_v2f16:
@@ -846,9 +839,8 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-SAFE-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
 ; GFX11-SAFE-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-SAFE-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-SAFE-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-SAFE-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-SAFE-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_negk_v2f16:
@@ -862,9 +854,7 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negk_negk_v2f16:
@@ -877,9 +867,8 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-NSZ-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NSZ-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
 ; GFX11-NSZ-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NSZ-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-NSZ-FAKE16-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-NSZ-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NSZ-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %select = select <2 x i1> %cmp, <2 x half> <half -2.0, half -2.0>, <2 x half> <half -1.0, half -1.0>
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 660ff4677547..04b98730c6a1 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -1515,7 +1515,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; EG-NEXT:    TRUNC * T0.W, PV.W,
 ; EG-NEXT:    MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
 ; EG-NEXT:    TRUNC * T0.W, PV.W,
-; EG-NEXT:    SETGE * T1.W, |PV.W|, |T0.Y|,
+; EG-NEXT:    SETGE * T1.W, |PV.W|, T0.Y,
 ; EG-NEXT:    CNDE T1.W, PV.W, 0.0, literal.x,
 ; EG-NEXT:    FLT_TO_UINT * T0.X, T0.W,
 ; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
@@ -1658,7 +1658,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; EG-NEXT:    TRUNC * T0.W, PV.W,
 ; EG-NEXT:    MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
 ; EG-NEXT:    TRUNC * T0.W, PV.W,
-; EG-NEXT:    SETGE * T1.W, |PV.W|, |T0.Y|,
+; EG-NEXT:    SETGE * T1.W, |PV.W|, T0.Y,
 ; EG-NEXT:    CNDE T1.W, PV.W, 0.0, literal.x,
 ; EG-NEXT:    FLT_TO_UINT * T0.X, T0.W,
 ; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
@@ -1858,7 +1858,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; EG-NEXT:    TRUNC * T0.W, PV.W,
 ; EG-NEXT:    MULADD_IEEE T1.W, -PV.W, T0.X, T0.Z,
 ; EG-NEXT:    TRUNC * T0.W, PV.W,
-; EG-NEXT:    SETGE * T1.W, |PV.W|, |T0.X|,
+; EG-NEXT:    SETGE * T1.W, |PV.W|, T0.X,
 ; EG-NEXT:    CNDE T1.W, PV.W, 0.0, literal.x,
 ; EG-NEXT:    FLT_TO_UINT * T0.X, T0.W,
 ; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
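
Note on the transform itself: the three new SimplifyDemandedBits cases rest on one IEEE-754 fact. FABS clears the sign bit, FNEG flips it, and FCOPYSIGN keeps the magnitude bits of its first operand while taking only the sign bit of its second. That is why each node folds away entirely when no user demands the sign bit, and why a known sign on the second FCOPYSIGN operand degenerates to FABS or FNEG(FABS), exactly as the Known/Known2 checks above do. A minimal standalone C++20 sketch of those bit-level identities (illustrative only, not LLVM code; every helper name here is made up):

#include <bit>
#include <cassert>
#include <cstdint>

// Sign bit of an IEEE-754 binary32 value.
constexpr uint32_t SignMask = 0x80000000u;

uint32_t bits(float F) { return std::bit_cast<uint32_t>(F); }

uint32_t fabsBits(uint32_t V) { return V & ~SignMask; } // clear the sign bit
uint32_t fnegBits(uint32_t V) { return V ^ SignMask; }  // flip the sign bit
uint32_t copysignBits(uint32_t Mag, uint32_t Sign) {    // magnitude of Mag, sign of Sign
  return (Mag & ~SignMask) | (Sign & SignMask);
}

int main() {
  uint32_t X = bits(-1.5f), Y = bits(2.0f);

  // If no user demands the sign bit, fabs/fneg are no-ops on the demanded
  // bits, so the node can be replaced by its input.
  uint32_t Demanded = ~SignMask;
  assert((fabsBits(X) & Demanded) == (X & Demanded));
  assert((fnegBits(X) & Demanded) == (X & Demanded));

  // copysign(x, y) with y known non-negative is fabs(x); with y known
  // negative it is fneg(fabs(x)).
  assert(copysignBits(X, Y) == fabsBits(X));
  assert(copysignBits(X, fnegBits(Y)) == fnegBits(fabsBits(X)));
  return 0;
}

Because fabs, fneg, and copysign are defined as pure bit operations, these identities hold for every input, including NaNs and infinities, which is what makes the generic demanded-bits formulation safe without fast-math flags.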