clang-p2996/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
Roman Lebedev 3d7bf6625a [X86][Costmodel] Improve cost modelling for not-fully-interleaved load
While I've modelled most of the relevant tuples for AVX2,
that only covered fully-interleaved groups.

By definition, an interleaved load of stride N means:
load N*VF elements, and shuffle them into N VF-sized vectors,
with the 0'th vector containing elements `[0, VF)*stride + 0`,
the 1'th vector containing elements `[0, VF)*stride + 1`, and so on.
Example: https://godbolt.org/z/df561Me5E (i64 stride 4 vf 2 => cost 6)
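
For concreteness, here is a hand-written IR sketch of that stride-4, VF-2
case (illustrative only; not part of the patch or of the test below): one
wide load of 4*2 = 8 contiguous i64 elements, shuffled into 4 vectors of
2 elements each, one per interleave index.

  define void @stride4.vf2(<8 x i64>* %p) {
    ; one wide load covering all stride*VF = 8 elements of the group
    %wide = load <8 x i64>, <8 x i64>* %p, align 8
    ; index 0: elements 0*4+0 and 1*4+0
    %v0 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
    ; index 1: elements 0*4+1 and 1*4+1
    %v1 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
    ; index 2: elements 0*4+2 and 1*4+2
    %v2 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
    ; index 3: elements 0*4+3 and 1*4+3
    %v3 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
    ret void
  }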

Now, a not-fully-interleaved load is one where not all of these vectors
are demanded. At worst, we could simply pretend that everything is
demanded and discard the non-demanded vectors. This means the cost of a
not-fully-interleaved group should be no greater than the cost of the
same fully-interleaved group, but perhaps somewhat less.
Examples:
https://godbolt.org/z/a78dK5Geq (i64 stride 4 (indices 012u) vf 2 => cost 4)
https://godbolt.org/z/G91ceo8dM (i64 stride 4 (indices 01uu) vf 2 => cost 2)
https://godbolt.org/z/5joYob9rx (i64 stride 4 (indices 0uuu) vf 2 => cost 1)
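
As a companion sketch for the `01uu` case above (again hand-written and
illustrative only): the same wide load, but the shuffles for the two
non-demanded indices are simply never emitted, so the cost can only drop
relative to the fully-interleaved group.

  define void @stride4.vf2.indices.01uu(<8 x i64>* %p) {
    %wide = load <8 x i64>, <8 x i64>* %p, align 8
    ; only indices 0 and 1 are demanded; no shuffles for indices 2 and 3
    %v0 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
    %v1 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
    ret void
  }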

As we have established over the course of the last ~70 patches (wow),
`BaseT::getInterleavedMemoryOpCost()` is absolutely bogus;
it usually overestimates by almost an order of magnitude,
so I would claim that we should at least use the hardcoded costs
of fully-interleaved load groups.

We could go further and adjust them, e.g. by the number of demanded
indices, but then I'm somewhat fearful of underestimating the cost.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D111174
2021-10-14 23:14:36 +03:00


; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize --force-vector-width=4 --force-vector-interleave=0 -S -o - < %s | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%0 = type { i32 }
%1 = type { i64 }

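; Each iteration of @foo loads a single pointer at the start of a group of
; four i64s, i.e. a stride-4 access where only the 0'th index is demanded
; (the `0uuu` shape discussed above); the CHECK lines expect the loop to be
; vectorized using masked gathers.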
define void @foo(i64* %p, i64* %p.last) unnamed_addr #0 {
; CHECK-LABEL: @foo(
; CHECK: vector.body:
; CHECK: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
;
entry:
br label %loop
loop:
%p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
%p.inc = getelementptr inbounds i64, i64* %p2, i64 4
%p3 = bitcast i64* %p2 to %0**
%v = load %0*, %0** %p3, align 8
%b = icmp eq i64* %p.inc, %p.last
br i1 %b, label %exit, label %loop
exit:
ret void
}

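; Same as @foo, but loading a pointer to the i64-wrapping struct type %1
; instead of the i32-wrapping %0.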
define void @bar(i64* %p, i64* %p.last) unnamed_addr #0 {
; CHECK-LABEL: @bar(
; CHECK: vector.body:
; CHECK: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
;
entry:
br label %loop
loop:
%p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
%p.inc = getelementptr inbounds i64, i64* %p2, i64 4
%p3 = bitcast i64* %p2 to %1**
%v = load %1*, %1** %p3, align 8
%b = icmp eq i64* %p.inc, %p.last
br i1 %b, label %exit, label %loop
exit:
ret void
}

attributes #0 = { "target-cpu"="skylake" }