While I've modelled most of the relevant tuples for AVX2, that only covered fully-interleaved groups. By definition, an interleaved load of stride N means: load N*VF elements and shuffle them into N VF-sized vectors, with the 0th vector containing elements `[0, VF)*stride + 0`, the 1st vector containing elements `[0, VF)*stride + 1`, and so on (see the IR sketches below). Example: https://godbolt.org/z/df561Me5E (i64, stride 4, VF 2 => cost 6).

A not-fully-interleaved load is one where not all of these vectors are demanded. At worst, we could just pretend that everything is demanded and discard the non-demanded vectors, which means the cost for a not-fully-interleaved group should be no greater than the cost of the same fully-interleaved group, though perhaps somewhat less. Examples:

https://godbolt.org/z/a78dK5Geq (i64, stride 4 (indices 012u), VF 2 => cost 4)
https://godbolt.org/z/G91ceo8dM (i64, stride 4 (indices 01uu), VF 2 => cost 2)
https://godbolt.org/z/5joYob9rx (i64, stride 4 (indices 0uuu), VF 2 => cost 1)

As we have established over the course of the last ~70 patches (wow), `BaseT::getInterleavedMemoryOpCost()` is absolutely bogus; it is usually almost an order of magnitude of overestimation. So I would claim that we should at least use the hardcoded costs of the fully-interleaved load groups. We could go further and adjust them, e.g. by the number of demanded indices, but then I'm somewhat fearful of underestimating the cost.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D111174
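To make the shuffle structure concrete, here is a hand-written sketch (illustrative only, not part of this patch) of what a fully-interleaved i64, stride-4, VF=2 load group de-interleaves into: one wide load of stride*VF = 8 elements, then one shufflevector per member vector. The function name and the store-to-output-pointer framing are made up for the example:

```llvm
define void @fully.interleaved(<8 x i64>* %p, <2 x i64>* %out0, <2 x i64>* %out1, <2 x i64>* %out2, <2 x i64>* %out3) {
  ; Load all stride*VF = 8 elements at once.
  %wide = load <8 x i64>, <8 x i64>* %p, align 8
  ; Member 0: elements [0, VF)*stride + 0 = {0, 4}
  %v0 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
  ; Member 1: elements [0, VF)*stride + 1 = {1, 5}
  %v1 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
  ; Member 2: elements {2, 6}
  %v2 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
  ; Member 3: elements {3, 7}
  %v3 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
  store <2 x i64> %v0, <2 x i64>* %out0, align 16
  store <2 x i64> %v1, <2 x i64>* %out1, align 16
  store <2 x i64> %v2, <2 x i64>* %out2, align 16
  store <2 x i64> %v3, <2 x i64>* %out3, align 16
  ret void
}
```

And a sketch of the not-fully-interleaved case, e.g. indices 01uu: the wide load is unchanged, but the shuffles for the non-demanded members are simply never emitted, which is why the cost can only stay the same or drop relative to the fully-interleaved group:

```llvm
define void @indices.01uu(<8 x i64>* %p, <2 x i64>* %out0, <2 x i64>* %out1) {
  %wide = load <8 x i64>, <8 x i64>* %p, align 8
  ; Only members 0 and 1 are demanded; members 2 and 3 need no shuffles.
  %v0 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
  %v1 = shufflevector <8 x i64> %wide, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
  store <2 x i64> %v0, <2 x i64>* %out0, align 16
  store <2 x i64> %v1, <2 x i64>* %out1, align 16
  ret void
}
```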
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize --force-vector-width=4 --force-vector-interleave=0 -S -o - < %s | FileCheck %s

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

%0 = type { i32 }
%1 = type { i64 }

define void @foo(i64* %p, i64* %p.last) unnamed_addr #0 {
; CHECK-LABEL: @foo(
; CHECK: vector.body:
; CHECK: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
;
entry:
  br label %loop

loop:
  %p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
  %p.inc = getelementptr inbounds i64, i64* %p2, i64 4
  %p3 = bitcast i64* %p2 to %0**
  %v = load %0*, %0** %p3, align 8
  %b = icmp eq i64* %p.inc, %p.last
  br i1 %b, label %exit, label %loop

exit:
  ret void
}

define void @bar(i64* %p, i64* %p.last) unnamed_addr #0 {
; CHECK-LABEL: @bar(
; CHECK: vector.body:
; CHECK: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
;
entry:
  br label %loop

loop:
  %p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
  %p.inc = getelementptr inbounds i64, i64* %p2, i64 4
  %p3 = bitcast i64* %p2 to %1**
  %v = load %1*, %1** %p3, align 8
  %b = icmp eq i64* %p.inc, %p.last
  br i1 %b, label %exit, label %loop

exit:
  ret void
}

attributes #0 = { "target-cpu"="skylake" }