[AArch64][TTI] Reduce cost for splatting whole first vector segment (SVE) (#145701)

Improve cost modeling for splatting the first 128b segment.
This commit is contained in:
Graham Hunter
2025-07-02 09:51:56 +01:00
committed by GitHub
parent a75587d271
commit 85bc868417
3 changed files with 73 additions and 4 deletions

View File

@@ -6750,6 +6750,21 @@ inline std::optional<unsigned> isDUPQMask(ArrayRef<int> Mask, unsigned Segments,
return std::nullopt;
}
/// isDUPFirstSegmentMask - matches a splat of the first 128b segment.
inline bool isDUPFirstSegmentMask(ArrayRef<int> Mask, unsigned Segments,
unsigned SegmentSize) {
// Make sure there's no size changes.
if (SegmentSize * Segments != Mask.size())
return false;
// Check that all lanes refer to the equivalent lane in the first segment.
// Undef/poison lanes (<0) are also accepted.
return all_of(enumerate(Mask), [&](auto P) {
const unsigned IndexWithinSegment = P.index() % SegmentSize;
return P.value() < 0 || unsigned(P.value()) == IndexWithinSegment;
});
}
} // namespace llvm
#endif

View File

@@ -5600,9 +5600,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
}
// Segmented shuffle matching.
if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
ST->isSVEorStreamingSVEAvailable() && Kind == TTI::SK_PermuteSingleSrc &&
isa<FixedVectorType>(SrcTy) && !Mask.empty() &&
if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
!Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
AArch64::SVEBitsPerBlock)) {
@@ -5612,7 +5611,14 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
unsigned SegmentElts = VTy->getNumElements() / Segments;
// dupq zd.t, zn.t[idx]
if (isDUPQMask(Mask, Segments, SegmentElts))
if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
ST->isSVEorStreamingSVEAvailable() &&
isDUPQMask(Mask, Segments, SegmentElts))
return LT.first;
// mov zd.q, vn
if (ST->isSVEorStreamingSVEAvailable() &&
isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
return LT.first;
}

View File

@@ -49,5 +49,53 @@ define void @dup_within_each_segment_512b() #1 {
ret void
}
define void @dup_whole_segment_256b() #0 {
; CHECK-LABEL: 'dup_whole_segment_256b'
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3, i32 poison, i32 1, i32 2, i32 3>
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 0, i32 1, i32 2, i32 3>
%dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
%dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1,
i32 0, i32 1, i32 0, i32 1>
%dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3,
i32 poison, i32 1, i32 2, i32 3>
ret void
}
define void @dup_whole_segment_512b() #1 {
; CHECK-LABEL: 'dup_whole_segment_512b'
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3, i32 poison, i32 1, i32 2, i32 3>
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3,
i32 0, i32 1, i32 2, i32 3>
%dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
%dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1,
i32 0, i32 1, i32 0, i32 1>
%dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 3,
i32 poison, i32 1, i32 2, i32 3>
ret void
}
attributes #0 = { noinline vscale_range(2,2) }
attributes #1 = { noinline vscale_range(4,4) }