Files
clang-p2996/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
David Green b65267ca7b [LV] Invalidate widening decisions after maximizing vector bandwidth
When MaximizeVectorBandwidth is enabled, we can end up (via calls to
collectUniformsAndScalars/setCostBasedWideningDecision through
calculateRegisterUsage) making widening decisions before we have decided
whether to fold the tail by masking. These decisions will be wrong if we
later decide to fold the tail, for example when the trip count is very
low. The cost model will then use incorrect costs for loads that should be
masked, using standard memory operation costs instead.

For now this still uses the EmulatedMaskMemRefHack costs (which is
unfortunate), but the old costs without this change were 1, leading to
overly optimistic vectorization.

This slightly changes the way that the MaximizeVectorBandwidth option
works to make it easier to test, always honouring the option if it is
set.

Differential Revision: https://reviews.llvm.org/D120215
2022-03-31 09:19:31 +01:00

130 lines
7.7 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -vectorizer-maximize-bandwidth -S 2>&1 | FileCheck %s
; RUN: opt < %s -loop-vectorize -vectorizer-maximize-bandwidth -S -debug-only=loop-vectorize 2>&1 -disable-output | FileCheck %s --check-prefix=COST
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-eabi"
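; Check that the maximize vector bandwidth option does not give incorrect costs
; due to stale widening decisions made before tail folding was decided. The
; loop below has a low maximum trip count (at most 3), so the cost model must
; treat its loads as masked (tail folded by masking).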
; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %0 = load
; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load
; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %0 = load
; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %0 = load
; COST: LV: Selecting VF: 1.
; @test computes four i8 dot products that share one input vector. Each loop
; iteration loads one byte from %pInVec and one byte from each of
; %pInA1..%pInA4, sign-extends them to i32 and accumulates the four products.
; The function returns the sum of the four accumulators, or 0 when the loop is
; skipped. The trip count is (%numCols & 3), i.e. at most 3.
define i32 @test(i8* nocapture noundef readonly %pInVec, i8* nocapture noundef readonly %pInA1, i8* nocapture noundef readonly %pInA2, i8* nocapture noundef readonly %pInA3, i8* nocapture noundef readonly %pInA4, i32 noundef %numCols) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[AND:%.*]] = and i32 [[NUMCOLS:%.*]], 3
; CHECK-NEXT: [[CMP_NOT32:%.*]] = icmp eq i32 [[AND]], 0
; CHECK-NEXT: br i1 [[CMP_NOT32]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
; CHECK: while.body.preheader:
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; CHECK: while.body:
; CHECK-NEXT: [[PINVEC_ADDR_042:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PINVEC:%.*]], [[WHILE_BODY_PREHEADER]] ]
; CHECK-NEXT: [[SUM4_041:%.*]] = phi i32 [ [[ADD14:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
; CHECK-NEXT: [[SUM3_040:%.*]] = phi i32 [ [[ADD10:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
; CHECK-NEXT: [[SUM2_039:%.*]] = phi i32 [ [[ADD6:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
; CHECK-NEXT: [[SUM1_038:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
; CHECK-NEXT: [[COLCNT_037:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[AND]], [[WHILE_BODY_PREHEADER]] ]
; CHECK-NEXT: [[PINA1_ADDR_036:%.*]] = phi i8* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PINA1:%.*]], [[WHILE_BODY_PREHEADER]] ]
; CHECK-NEXT: [[PINA4_ADDR_035:%.*]] = phi i8* [ [[INCDEC_PTR11:%.*]], [[WHILE_BODY]] ], [ [[PINA4:%.*]], [[WHILE_BODY_PREHEADER]] ]
; CHECK-NEXT: [[PINA3_ADDR_034:%.*]] = phi i8* [ [[INCDEC_PTR7:%.*]], [[WHILE_BODY]] ], [ [[PINA3:%.*]], [[WHILE_BODY_PREHEADER]] ]
; CHECK-NEXT: [[PINA2_ADDR_033:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[PINA2:%.*]], [[WHILE_BODY_PREHEADER]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PINVEC_ADDR_042]], i64 1
; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[PINVEC_ADDR_042]], align 1
; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32
; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i8, i8* [[PINA1_ADDR_036]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[PINA1_ADDR_036]], align 1
; CHECK-NEXT: [[CONV2:%.*]] = sext i8 [[TMP1]] to i32
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
; CHECK-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[SUM1_038]]
; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PINA2_ADDR_033]], i64 1
; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[PINA2_ADDR_033]], align 1
; CHECK-NEXT: [[CONV4:%.*]] = sext i8 [[TMP2]] to i32
; CHECK-NEXT: [[MUL5:%.*]] = mul nsw i32 [[CONV4]], [[CONV]]
; CHECK-NEXT: [[ADD6]] = add nsw i32 [[MUL5]], [[SUM2_039]]
; CHECK-NEXT: [[INCDEC_PTR7]] = getelementptr inbounds i8, i8* [[PINA3_ADDR_034]], i64 1
; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[PINA3_ADDR_034]], align 1
; CHECK-NEXT: [[CONV8:%.*]] = sext i8 [[TMP3]] to i32
; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[CONV8]], [[CONV]]
; CHECK-NEXT: [[ADD10]] = add nsw i32 [[MUL9]], [[SUM3_040]]
; CHECK-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i8, i8* [[PINA4_ADDR_035]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[PINA4_ADDR_035]], align 1
; CHECK-NEXT: [[CONV12:%.*]] = sext i8 [[TMP4]] to i32
; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[CONV12]], [[CONV]]
; CHECK-NEXT: [[ADD14]] = add nsw i32 [[MUL13]], [[SUM4_041]]
; CHECK-NEXT: [[DEC]] = add nsw i32 [[COLCNT_037]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
; CHECK: while.end.loopexit:
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ]
; CHECK-NEXT: [[ADD6_LCSSA:%.*]] = phi i32 [ [[ADD6]], [[WHILE_BODY]] ]
; CHECK-NEXT: [[ADD10_LCSSA:%.*]] = phi i32 [ [[ADD10]], [[WHILE_BODY]] ]
; CHECK-NEXT: [[ADD14_LCSSA:%.*]] = phi i32 [ [[ADD14]], [[WHILE_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 [[ADD6_LCSSA]], [[ADD_LCSSA]]
; CHECK-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP5]], [[ADD10_LCSSA]]
; CHECK-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP6]], [[ADD14_LCSSA]]
; CHECK-NEXT: br label [[WHILE_END]]
; CHECK: while.end:
; CHECK-NEXT: [[ADD17:%.*]] = phi i32 [ [[TMP7]], [[WHILE_END_LOOPEXIT]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: ret i32 [[ADD17]]
;
entry:
; Only the low two bits of %numCols are used, so the loop runs 0 to 3 times.
%and = and i32 %numCols, 3
%cmp.not32 = icmp eq i32 %and, 0
; Skip the loop entirely when there are no iterations to run.
br i1 %cmp.not32, label %while.end, label %while.body
while.body: ; preds = %entry, %while.body
; Loop-carried state: the five advancing input pointers, the four running sums
; (all starting at 0) and the remaining-iteration counter (starting at %and).
%pInVec.addr.042 = phi i8* [ %incdec.ptr, %while.body ], [ %pInVec, %entry ]
%sum4.041 = phi i32 [ %add14, %while.body ], [ 0, %entry ]
%sum3.040 = phi i32 [ %add10, %while.body ], [ 0, %entry ]
%sum2.039 = phi i32 [ %add6, %while.body ], [ 0, %entry ]
%sum1.038 = phi i32 [ %add, %while.body ], [ 0, %entry ]
%colCnt.037 = phi i32 [ %dec, %while.body ], [ %and, %entry ]
%pInA1.addr.036 = phi i8* [ %incdec.ptr1, %while.body ], [ %pInA1, %entry ]
%pInA4.addr.035 = phi i8* [ %incdec.ptr11, %while.body ], [ %pInA4, %entry ]
%pInA3.addr.034 = phi i8* [ %incdec.ptr7, %while.body ], [ %pInA3, %entry ]
%pInA2.addr.033 = phi i8* [ %incdec.ptr3, %while.body ], [ %pInA2, %entry ]
; Load the shared input byte; all four products below reuse %conv. This is the
; "%0 = load" whose per-VF estimate the debug-output run line matches, so the
; implicit value numbering must not change.
%incdec.ptr = getelementptr inbounds i8, i8* %pInVec.addr.042, i64 1
%0 = load i8, i8* %pInVec.addr.042, align 1
%conv = sext i8 %0 to i32
; sum1 += sext(*pInA1++) * conv
%incdec.ptr1 = getelementptr inbounds i8, i8* %pInA1.addr.036, i64 1
%1 = load i8, i8* %pInA1.addr.036, align 1
%conv2 = sext i8 %1 to i32
%mul = mul nsw i32 %conv2, %conv
%add = add nsw i32 %mul, %sum1.038
; sum2 += sext(*pInA2++) * conv
%incdec.ptr3 = getelementptr inbounds i8, i8* %pInA2.addr.033, i64 1
%2 = load i8, i8* %pInA2.addr.033, align 1
%conv4 = sext i8 %2 to i32
%mul5 = mul nsw i32 %conv4, %conv
%add6 = add nsw i32 %mul5, %sum2.039
; sum3 += sext(*pInA3++) * conv
%incdec.ptr7 = getelementptr inbounds i8, i8* %pInA3.addr.034, i64 1
%3 = load i8, i8* %pInA3.addr.034, align 1
%conv8 = sext i8 %3 to i32
%mul9 = mul nsw i32 %conv8, %conv
%add10 = add nsw i32 %mul9, %sum3.040
; sum4 += sext(*pInA4++) * conv
%incdec.ptr11 = getelementptr inbounds i8, i8* %pInA4.addr.035, i64 1
%4 = load i8, i8* %pInA4.addr.035, align 1
%conv12 = sext i8 %4 to i32
%mul13 = mul nsw i32 %conv12, %conv
%add14 = add nsw i32 %mul13, %sum4.041
; Count down the remaining iterations and leave the loop when none are left.
%dec = add nsw i32 %colCnt.037, -1
%cmp.not = icmp eq i32 %dec, 0
br i1 %cmp.not, label %while.end.loopexit, label %while.body
while.end.loopexit: ; preds = %while.body
; Fold the four accumulators into a single result.
%5 = add nsw i32 %add6, %add
%6 = add nsw i32 %5, %add10
%7 = add nsw i32 %6, %add14
br label %while.end
while.end: ; preds = %while.end.loopexit, %entry
; Result is the folded sum when the loop ran, or 0 when it was skipped.
%add17 = phi i32 [ %7, %while.end.loopexit ], [ 0, %entry ]
ret i32 %add17
}