Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll
Alexey Bataev 0d74fd3fdf [SLP][COST][X86]Improve cost model for masked gather.
Revived D101297 in its original form + added some changes in X86
legalization checking for masked gathers.

This solution is the most stable and the most correct one. We have to
check the legality before trying to build the masked gather in SLP.
Without this check we have an incorrect cost (for SLP) when the masked gather
is not legal or is slower than the gather, and we miss some
vectorization opportunities.

This can be fixed in the cost model, but in this case we need to add
special checks for the cost of GEPs for ScatterVectorize node, add
special check for small trees, etc., i.e. there are a lot of corner
cases here and there, which increase the code base and make it harder to
maintain the code.

> Can't we rely on cost model to deal with this? This can be profitable for further vectorization, when we can start from such gather loads as seed.

The question from D101297. Actually, no, it can't. Actually, simple
gather may give us better result, especially after we started
vectorization of insertelements. Plus, like I said before, the cost for
non-legal masked gathers leads to missed vectorization opportunities.

Differential Revision: https://reviews.llvm.org/D105042
2021-07-08 11:53:30 -07:00

189 lines
9.0 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -scoped-noalias-aa -slp-vectorizer -mtriple=x86_64-apple-darwin -enable-new-pm=false -S %s | FileCheck %s
; Four unrolled iterations of out_block[i] ^= counter[i] for i = 0..3 on i32
; elements. Both pointer arguments are nocapture (and %counter is readonly),
; but neither is marked noalias.
; The autogenerated assertions expect the same scalar load/xor/store sequence
; as the input, i.e. the function is left unvectorized.
; NOTE(review): presumably it stays scalar because each store to %out_block may
; alias the following load from %counter - confirm against the SLP vectorizer's
; dependence analysis.
define void @version_multiple(i32* nocapture %out_block, i32* nocapture readonly %counter) {
; CHECK-LABEL: @version_multiple(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[COUNTER:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OUT_BLOCK:%.*]], align 4
; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[TMP1]], [[TMP0]]
; CHECK-NEXT: store i32 [[XOR]], i32* [[OUT_BLOCK]], align 4
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 1
; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 1
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_1]], align 4
; CHECK-NEXT: [[XOR_1:%.*]] = xor i32 [[TMP3]], [[TMP2]]
; CHECK-NEXT: store i32 [[XOR_1]], i32* [[ARRAYIDX2_1]], align 4
; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 2
; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 2
; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_2]], align 4
; CHECK-NEXT: [[XOR_2:%.*]] = xor i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT: store i32 [[XOR_2]], i32* [[ARRAYIDX2_2]], align 4
; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[COUNTER]], i64 3
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[OUT_BLOCK]], i64 3
; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX2_3]], align 4
; CHECK-NEXT: [[XOR_3:%.*]] = xor i32 [[TMP7]], [[TMP6]]
; CHECK-NEXT: store i32 [[XOR_3]], i32* [[ARRAYIDX2_3]], align 4
; CHECK-NEXT: ret void
;
entry:
; Element 0: out_block[0] ^= counter[0]
%0 = load i32, i32* %counter, align 4
%1 = load i32, i32* %out_block, align 4
%xor = xor i32 %1, %0
store i32 %xor, i32* %out_block, align 4
; Element 1: out_block[1] ^= counter[1]
%arrayidx.1 = getelementptr inbounds i32, i32* %counter, i64 1
%2 = load i32, i32* %arrayidx.1, align 4
%arrayidx2.1 = getelementptr inbounds i32, i32* %out_block, i64 1
%3 = load i32, i32* %arrayidx2.1, align 4
%xor.1 = xor i32 %3, %2
store i32 %xor.1, i32* %arrayidx2.1, align 4
; Element 2: out_block[2] ^= counter[2]
%arrayidx.2 = getelementptr inbounds i32, i32* %counter, i64 2
%4 = load i32, i32* %arrayidx.2, align 4
%arrayidx2.2 = getelementptr inbounds i32, i32* %out_block, i64 2
%5 = load i32, i32* %arrayidx2.2, align 4
%xor.2 = xor i32 %5, %4
store i32 %xor.2, i32* %arrayidx2.2, align 4
; Element 3: out_block[3] ^= counter[3]
%arrayidx.3 = getelementptr inbounds i32, i32* %counter, i64 3
%6 = load i32, i32* %arrayidx.3, align 4
%arrayidx2.3 = getelementptr inbounds i32, i32* %out_block, i64 3
%7 = load i32, i32* %arrayidx2.3, align 4
%xor.3 = xor i32 %7, %6
store i32 %xor.3, i32* %arrayidx2.3, align 4
ret void
}
; External function taking an <8 x float>; @delete_pointer_bound passes it the
; vector assembled in its else block.
declare void @use(<8 x float>)
; Branches on %c between two independent paths that both read from %b:
;   else: loads b[10] and b[14] and inserts them into lanes 2,3 and 4,7 of an
;         <8 x float> (the other lanes stay undef), then passes it to @use.
;   then: stores 0.0 to a[8], a[6] and a[7], and stores the value loaded from
;         b[14] to a[5].
; The function carries attribute group #0 ("+avx2", defined at the end of the
; file). The autogenerated assertions expect the same scalar instructions in the
; same order as the input, i.e. nothing is vectorized.
; NOTE(review): the name suggests this is a regression test for a pointer-bound
; value being deleted during vectorization - confirm against the original commit.
define void @delete_pointer_bound(float* %a, float* %b, i1 %c) #0 {
; CHECK-LABEL: @delete_pointer_bound(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[B_10:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 10
; CHECK-NEXT: [[B_14:%.*]] = getelementptr inbounds float, float* [[B]], i64 14
; CHECK-NEXT: br i1 [[C:%.*]], label [[THEN:%.*]], label [[ELSE:%.*]]
; CHECK: else:
; CHECK-NEXT: [[L0:%.*]] = load float, float* [[B_10]], align 4
; CHECK-NEXT: [[L1:%.*]] = load float, float* [[B_14]], align 4
; CHECK-NEXT: [[I2:%.*]] = insertelement <8 x float> undef, float [[L0]], i32 2
; CHECK-NEXT: [[I3:%.*]] = insertelement <8 x float> [[I2]], float [[L0]], i32 3
; CHECK-NEXT: [[I4:%.*]] = insertelement <8 x float> [[I3]], float [[L1]], i32 4
; CHECK-NEXT: [[I7:%.*]] = insertelement <8 x float> [[I4]], float [[L1]], i32 7
; CHECK-NEXT: call void @use(<8 x float> [[I7]])
; CHECK-NEXT: ret void
; CHECK: then:
; CHECK-NEXT: [[A_8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 8
; CHECK-NEXT: store float 0.000000e+00, float* [[A_8]], align 4
; CHECK-NEXT: [[L6:%.*]] = load float, float* [[B_14]], align 4
; CHECK-NEXT: [[A_5:%.*]] = getelementptr inbounds float, float* [[A]], i64 5
; CHECK-NEXT: store float [[L6]], float* [[A_5]], align 4
; CHECK-NEXT: [[A_6:%.*]] = getelementptr inbounds float, float* [[A]], i64 6
; CHECK-NEXT: store float 0.000000e+00, float* [[A_6]], align 4
; CHECK-NEXT: [[A_7:%.*]] = getelementptr inbounds float, float* [[A]], i64 7
; CHECK-NEXT: store float 0.000000e+00, float* [[A_7]], align 4
; CHECK-NEXT: ret void
;
entry:
; Both addresses are computed once here and shared by the two successors.
%b.10 = getelementptr inbounds float, float* %b, i64 10
%b.14 = getelementptr inbounds float, float* %b, i64 14
br i1 %c, label %then, label %else
else:
; Build a partially-defined <8 x float>: lanes 2,3 = b[10], lanes 4,7 = b[14].
%l0 = load float, float* %b.10, align 4
%l1 = load float, float* %b.14, align 4
%i2 = insertelement <8 x float> undef, float %l0, i32 2
%i3 = insertelement <8 x float> %i2, float %l0, i32 3
%i4 = insertelement <8 x float> %i3, float %l1, i32 4
%i7 = insertelement <8 x float> %i4, float %l1, i32 7
call void @use(<8 x float> %i7)
ret void
then:
; Stores to a[8], a[5], a[6], a[7]; the load of b[14] sits between the first
; store and the remaining ones.
%a.8 = getelementptr inbounds float, float* %a, i64 8
store float 0.0, float* %a.8, align 4
%l6 = load float, float* %b.14, align 4
%a.5 = getelementptr inbounds float, float* %a, i64 5
store float %l6, float* %a.5, align 4
%a.6 = getelementptr inbounds float, float* %a, i64 6
store float 0.0, float* %a.6, align 4
%a.7 = getelementptr inbounds float, float* %a, i64 7
store float 0.0, float* %a.7, align 4
ret void
}
; Named struct types for @preserve_loop_info. %struct.zot is the type its
; argument points to; the remaining types are reachable through zot's fields
; (zot -> quux, wombat, wombat.0; wombat -> eggs -> ham; wombat.0 -> bar).
; No field of any of these structs is accessed by the IR in this file.
%struct.zot = type { i16, i16, i16, i32, float, float, float, %struct.quux*, %struct.zot*, %struct.wombat*, %struct.wombat.0 }
%struct.quux = type { i16, %struct.quux*, %struct.quux* }
%struct.wombat = type { i32, i16, i8, i8, %struct.eggs* }
%struct.eggs = type { float, i8, %struct.ham }
%struct.ham = type { [2 x double], [8 x i8] }
%struct.wombat.0 = type { %struct.bar }
%struct.bar = type { [3 x double], [3 x double], double, double, i16, [3 x double]*, i32, [3 x double] }
; Control-flow skeleton with an outer loop (outer.header/inner/outer.latch)
; containing a self-looping inner block, followed by a third loop
; (loop.3header/bb9/loop.3latch). All conditional branches use undef conditions.
; The only memory traffic is in bb9: a store of undef to tmp[0], a load from
; element 1 of a [3 x double] reached through the pointer %tmp5, and a store of
; that value to tmp[1].
; The autogenerated assertions expect the output to match the input
; instruction for instruction, i.e. the two stores in bb9 are not vectorized.
; NOTE(review): the name suggests this guards against the pass failing to keep
; loop info valid - confirm against the commit that introduced it.
define double @preserve_loop_info(%struct.zot* %arg) {
; CHECK-LABEL: @preserve_loop_info(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP:%.*]] = alloca [3 x double], align 16
; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
; CHECK: outer.header:
; CHECK-NEXT: br label [[INNER:%.*]]
; CHECK: inner:
; CHECK-NEXT: br i1 undef, label [[OUTER_LATCH:%.*]], label [[INNER]]
; CHECK: outer.latch:
; CHECK-NEXT: br i1 undef, label [[BB:%.*]], label [[OUTER_HEADER]]
; CHECK: bb:
; CHECK-NEXT: [[TMP5:%.*]] = load [3 x double]*, [3 x double]** undef, align 8
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[TMP]], i64 0, i64 0
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[TMP]], i64 0, i64 1
; CHECK-NEXT: br label [[LOOP_3HEADER:%.*]]
; CHECK: loop.3header:
; CHECK-NEXT: br i1 undef, label [[LOOP_3LATCH:%.*]], label [[BB9:%.*]]
; CHECK: bb9:
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[TMP5]], i64 undef, i64 1
; CHECK-NEXT: store double undef, double* [[TMP6]], align 16
; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
; CHECK-NEXT: store double [[TMP12]], double* [[TMP7]], align 8
; CHECK-NEXT: br label [[LOOP_3LATCH]]
; CHECK: loop.3latch:
; CHECK-NEXT: br i1 undef, label [[BB14:%.*]], label [[LOOP_3HEADER]]
; CHECK: bb14:
; CHECK-NEXT: [[TMP15:%.*]] = call double undef(double* [[TMP6]], %struct.zot* [[ARG:%.*]])
; CHECK-NEXT: ret double undef
;
entry:
%tmp = alloca [3 x double], align 16
br label %outer.header
outer.header: ; preds = %outer.latch, %entry
br label %inner
inner: ; preds = %inner, %outer.header
br i1 undef, label %outer.latch, label %inner
outer.latch: ; preds = %inner
br i1 undef, label %bb, label %outer.header
bb: ; preds = %outer.latch
; Pointers to tmp[0] and tmp[1] are formed once, outside the third loop.
%tmp5 = load [3 x double]*, [3 x double]** undef, align 8
%tmp6 = getelementptr inbounds [3 x double], [3 x double]* %tmp, i64 0, i64 0
%tmp7 = getelementptr inbounds [3 x double], [3 x double]* %tmp, i64 0, i64 1
br label %loop.3header
loop.3header: ; preds = %loop.3latch, %bb
br i1 undef, label %loop.3latch, label %bb9
bb9: ; preds = %loop.3header
; Two adjacent stores (tmp[0], tmp[1]) separated by a load through %tmp5.
%tmp10 = getelementptr inbounds [3 x double], [3 x double]* %tmp5, i64 undef, i64 1
store double undef, double* %tmp6, align 16
%tmp12 = load double, double* %tmp10, align 8
store double %tmp12, double* %tmp7, align 8
br label %loop.3latch
loop.3latch: ; preds = %bb9, %loop.3header
br i1 undef, label %bb14, label %loop.3header
bb14: ; preds = %loop.3latch
; Indirect call through an undef callee; passes the pointer to tmp[0] and %arg.
%tmp15 = call double undef(double* %tmp6, %struct.zot* %arg)
ret double undef
}
; Attribute group #0 is referenced by @delete_pointer_bound and adds the AVX2
; target feature for that function.
attributes #0 = { "target-features"="+avx2" }