When MaximizeVectorBandwidth is enabled, we can end up (via calls to collectUniformsAndScalars/setCostBasedWideningDecision through calculateRegisterUsage) making widening decisions before we have decided whether to fold the tail by masking. These decisions will be wrong if we later decide to fold the tail, for example when the trip count is very low: loads that should be masked end up being costed with the standard memory operation costs instead of masked-load costs. With this change the masked loads still use the EmulatedMaskMemRefHack costs for the moment (a bit unfortunately), but the old costs without this change were 1, leading to overly optimistic vectorization. This also slightly changes the way that the MaximizeVectorBandwidth option works to make it easier to test, always honouring the option if it is set. Differential Revision: https://reviews.llvm.org/D120215
130 lines
7.7 KiB
LLVM
130 lines
7.7 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; REQUIRES: asserts
|
|
; RUN: opt < %s -loop-vectorize -vectorizer-maximize-bandwidth -S 2>&1 | FileCheck %s
|
|
; RUN: opt < %s -loop-vectorize -vectorizer-maximize-bandwidth -S -debug-only=loop-vectorize 2>&1 -disable-output | FileCheck %s --check-prefix=COST
|
|
|
|
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
|
|
target triple = "aarch64-none-unknown-eabi"
|
|
|
|
; Check that the maximize vector bandwidth option does not give incorrect costs
|
|
; due to invalid cost decisions. The loop below has a low maximum trip count,
|
|
; so its tail will be folded by masking.
|
|
|
|
; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %0 = load
|
|
; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load
|
|
; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %0 = load
|
|
; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %0 = load
|
|
; COST: LV: Selecting VF: 1.
|
|
|
|
; @test accumulates four dot products: the i8 stream at %pInVec is multiplied
; element-wise with each of the i8 streams at %pInA1..%pInA4 (all values are
; sign-extended to i32 first), and the sum of the four accumulators is returned.
; The iteration count is %numCols & 3, so the loop body runs at most three
; times and is skipped (returning 0) when that value is zero.
; The expected output below contains only scalar i8 loads and i32 arithmetic,
; i.e. the loop is expected to be left unvectorized.
define i32 @test(i8* nocapture noundef readonly %pInVec, i8* nocapture noundef readonly %pInA1, i8* nocapture noundef readonly %pInA2, i8* nocapture noundef readonly %pInA3, i8* nocapture noundef readonly %pInA4, i32 noundef %numCols) {
|
|
; CHECK-LABEL: @test(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[AND:%.*]] = and i32 [[NUMCOLS:%.*]], 3
|
|
; CHECK-NEXT: [[CMP_NOT32:%.*]] = icmp eq i32 [[AND]], 0
|
|
; CHECK-NEXT: br i1 [[CMP_NOT32]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
|
|
; CHECK: while.body.preheader:
|
|
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
|
|
; CHECK: while.body:
|
|
; CHECK-NEXT: [[PINVEC_ADDR_042:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PINVEC:%.*]], [[WHILE_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[SUM4_041:%.*]] = phi i32 [ [[ADD14:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[SUM3_040:%.*]] = phi i32 [ [[ADD10:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[SUM2_039:%.*]] = phi i32 [ [[ADD6:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[SUM1_038:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[COLCNT_037:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[AND]], [[WHILE_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[PINA1_ADDR_036:%.*]] = phi i8* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PINA1:%.*]], [[WHILE_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[PINA4_ADDR_035:%.*]] = phi i8* [ [[INCDEC_PTR11:%.*]], [[WHILE_BODY]] ], [ [[PINA4:%.*]], [[WHILE_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[PINA3_ADDR_034:%.*]] = phi i8* [ [[INCDEC_PTR7:%.*]], [[WHILE_BODY]] ], [ [[PINA3:%.*]], [[WHILE_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[PINA2_ADDR_033:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[PINA2:%.*]], [[WHILE_BODY_PREHEADER]] ]
|
|
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PINVEC_ADDR_042]], i64 1
|
|
; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[PINVEC_ADDR_042]], align 1
|
|
; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32
|
|
; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i8, i8* [[PINA1_ADDR_036]], i64 1
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[PINA1_ADDR_036]], align 1
|
|
; CHECK-NEXT: [[CONV2:%.*]] = sext i8 [[TMP1]] to i32
|
|
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
|
|
; CHECK-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[SUM1_038]]
|
|
; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PINA2_ADDR_033]], i64 1
|
|
; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[PINA2_ADDR_033]], align 1
|
|
; CHECK-NEXT: [[CONV4:%.*]] = sext i8 [[TMP2]] to i32
|
|
; CHECK-NEXT: [[MUL5:%.*]] = mul nsw i32 [[CONV4]], [[CONV]]
|
|
; CHECK-NEXT: [[ADD6]] = add nsw i32 [[MUL5]], [[SUM2_039]]
|
|
; CHECK-NEXT: [[INCDEC_PTR7]] = getelementptr inbounds i8, i8* [[PINA3_ADDR_034]], i64 1
|
|
; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[PINA3_ADDR_034]], align 1
|
|
; CHECK-NEXT: [[CONV8:%.*]] = sext i8 [[TMP3]] to i32
|
|
; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[CONV8]], [[CONV]]
|
|
; CHECK-NEXT: [[ADD10]] = add nsw i32 [[MUL9]], [[SUM3_040]]
|
|
; CHECK-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i8, i8* [[PINA4_ADDR_035]], i64 1
|
|
; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[PINA4_ADDR_035]], align 1
|
|
; CHECK-NEXT: [[CONV12:%.*]] = sext i8 [[TMP4]] to i32
|
|
; CHECK-NEXT: [[MUL13:%.*]] = mul nsw i32 [[CONV12]], [[CONV]]
|
|
; CHECK-NEXT: [[ADD14]] = add nsw i32 [[MUL13]], [[SUM4_041]]
|
|
; CHECK-NEXT: [[DEC]] = add nsw i32 [[COLCNT_037]], -1
|
|
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
|
|
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
|
|
; CHECK: while.end.loopexit:
|
|
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ]
|
|
; CHECK-NEXT: [[ADD6_LCSSA:%.*]] = phi i32 [ [[ADD6]], [[WHILE_BODY]] ]
|
|
; CHECK-NEXT: [[ADD10_LCSSA:%.*]] = phi i32 [ [[ADD10]], [[WHILE_BODY]] ]
|
|
; CHECK-NEXT: [[ADD14_LCSSA:%.*]] = phi i32 [ [[ADD14]], [[WHILE_BODY]] ]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 [[ADD6_LCSSA]], [[ADD_LCSSA]]
|
|
; CHECK-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP5]], [[ADD10_LCSSA]]
|
|
; CHECK-NEXT: [[TMP7:%.*]] = add nsw i32 [[TMP6]], [[ADD14_LCSSA]]
|
|
; CHECK-NEXT: br label [[WHILE_END]]
|
|
; CHECK: while.end:
|
|
; CHECK-NEXT: [[ADD17:%.*]] = phi i32 [ [[TMP7]], [[WHILE_END_LOOPEXIT]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: ret i32 [[ADD17]]
|
|
;
|
|
; The input IR starts here; the directive lines above describe the expected output.
entry:
|
|
; Trip count: only the low two bits of %numCols are kept, giving a value in 0..3.
%and = and i32 %numCols, 3
|
|
; A zero trip count bypasses the loop and goes straight to the return block.
%cmp.not32 = icmp eq i32 %and, 0
|
|
br i1 %cmp.not32, label %while.end, label %while.body
|
|
|
|
; Loop body: each iteration reads one i8 through every input pointer, advances
; each pointer by one byte, and updates the four i32 accumulators.
while.body: ; preds = %entry, %while.body
|
|
%pInVec.addr.042 = phi i8* [ %incdec.ptr, %while.body ], [ %pInVec, %entry ]
|
|
%sum4.041 = phi i32 [ %add14, %while.body ], [ 0, %entry ]
|
|
%sum3.040 = phi i32 [ %add10, %while.body ], [ 0, %entry ]
|
|
%sum2.039 = phi i32 [ %add6, %while.body ], [ 0, %entry ]
|
|
%sum1.038 = phi i32 [ %add, %while.body ], [ 0, %entry ]
|
|
%colCnt.037 = phi i32 [ %dec, %while.body ], [ %and, %entry ]
|
|
%pInA1.addr.036 = phi i8* [ %incdec.ptr1, %while.body ], [ %pInA1, %entry ]
|
|
%pInA4.addr.035 = phi i8* [ %incdec.ptr11, %while.body ], [ %pInA4, %entry ]
|
|
%pInA3.addr.034 = phi i8* [ %incdec.ptr7, %while.body ], [ %pInA3, %entry ]
|
|
%pInA2.addr.033 = phi i8* [ %incdec.ptr3, %while.body ], [ %pInA2, %entry ]
|
|
; The %pInVec element loaded here (%conv) is the common factor of all four
; products computed in this iteration. The cost-model lines near the top of
; the file match this load by its printed name, so it must stay %0.
%incdec.ptr = getelementptr inbounds i8, i8* %pInVec.addr.042, i64 1
|
|
%0 = load i8, i8* %pInVec.addr.042, align 1
|
|
%conv = sext i8 %0 to i32
|
|
; sum1 += sext(*pInA1) * sext(*pInVec)
%incdec.ptr1 = getelementptr inbounds i8, i8* %pInA1.addr.036, i64 1
|
|
%1 = load i8, i8* %pInA1.addr.036, align 1
|
|
%conv2 = sext i8 %1 to i32
|
|
%mul = mul nsw i32 %conv2, %conv
|
|
%add = add nsw i32 %mul, %sum1.038
|
|
; sum2 += sext(*pInA2) * sext(*pInVec)
%incdec.ptr3 = getelementptr inbounds i8, i8* %pInA2.addr.033, i64 1
|
|
%2 = load i8, i8* %pInA2.addr.033, align 1
|
|
%conv4 = sext i8 %2 to i32
|
|
%mul5 = mul nsw i32 %conv4, %conv
|
|
%add6 = add nsw i32 %mul5, %sum2.039
|
|
; sum3 += sext(*pInA3) * sext(*pInVec)
%incdec.ptr7 = getelementptr inbounds i8, i8* %pInA3.addr.034, i64 1
|
|
%3 = load i8, i8* %pInA3.addr.034, align 1
|
|
%conv8 = sext i8 %3 to i32
|
|
%mul9 = mul nsw i32 %conv8, %conv
|
|
%add10 = add nsw i32 %mul9, %sum3.040
|
|
; sum4 += sext(*pInA4) * sext(*pInVec)
%incdec.ptr11 = getelementptr inbounds i8, i8* %pInA4.addr.035, i64 1
|
|
%4 = load i8, i8* %pInA4.addr.035, align 1
|
|
%conv12 = sext i8 %4 to i32
|
|
%mul13 = mul nsw i32 %conv12, %conv
|
|
%add14 = add nsw i32 %mul13, %sum4.041
|
|
; Count the remaining iterations down; the loop exits when the counter hits zero.
%dec = add nsw i32 %colCnt.037, -1
|
|
%cmp.not = icmp eq i32 %dec, 0
|
|
br i1 %cmp.not, label %while.end.loopexit, label %while.body
|
|
|
|
; Loop exit: fold the four accumulators into a single i32 result.
while.end.loopexit: ; preds = %while.body
|
|
%5 = add nsw i32 %add6, %add
|
|
%6 = add nsw i32 %5, %add10
|
|
%7 = add nsw i32 %6, %add14
|
|
br label %while.end
|
|
|
|
; Return block: the result is 0 when control arrives directly from %entry.
while.end: ; preds = %while.end.loopexit, %entry
|
|
%add17 = phi i32 [ %7, %while.end.loopexit ], [ 0, %entry ]
|
|
ret i32 %add17
|
|
}
|