Files
clang-p2996/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll
Florian Hahn 68469a80cb [LV] Disable runtime unrolling for vectorized loops.
This patch adds metadata to disable runtime unrolling to the vectorized
loop. If runtime unrolling/interleaving is considered profitable, LV
will interleave the loop directly. There should be no need to perform
runtime unrolling at a later stage.

Note that we already add metadata to disable runtime unrolling to the
scalar loop after vectorization.

The additional unrolling unnecessarily increases code size and compile
time. In addition to that we have several bug reports of unnecessary
runtime unrolling for vectorized loops, e.g. PR40961.

Compile-time improvements:

  NewPM-O3: -1.04%
  NewPM-ReleaseThinLTO: -0.59%
  NewPM-ReleaseLTO-g: -0.97%

https://llvm-compile-time-tracker.com/compare.php?from=ce1be13a868d0f8afa367975558c1a6175cce33a&to=78bc2e67f22e9e10e61cdb6cdac4bb857d95eb1b&stat=instructions:u

Fixes #40306.

Reviewed By: lebedev.ri, nikic

Differential Revision: https://reviews.llvm.org/D115261
2023-01-06 10:56:17 +00:00

109 lines
5.3 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=loop-vectorize -vectorize-num-stores-pred=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
; NOTE(review): presumably -prefer-predicate-over-epilogue=predicate-dont-vectorize
; asks the vectorizer to fold the loop tail via predication instead of emitting a
; scalar epilogue, and -vectorize-num-stores-pred=2 permits the two predicated
; stores in the loop below -- confirm against the LoopVectorize option definitions.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Vectorization with dependence checks.
; Check that a non-power-of-2 MaxVF, calculated based on maximum safe distance,
; does not lead fold-tail to think that no tail will be generated for any chosen
; (power of 2) VF.
; Dependence distance here is 3 iterations.
; Tiny trip count of 15 is divisible by 3, but any (even) VF will have a tail.
;unsigned char a [15+3];
;void maxvf3(){
; for (int j = 0; j < 15; ++j) {
; a[j] = 69;
; a[j+3] = 7;
; }
;}
; Backing array for the loop: 15 iterations plus the +3 offset of the second
; store gives 18 bytes (the "15+3" of the C sketch above).
@a = common local_unnamed_addr global [18 x i8] zeroinitializer, align 16
; maxvf3: scalar loop over j in [0, 15) that stores 69 to a[j] and 7 to a[j+3].
; The autogenerated FileCheck lines inside the function pin the expected
; vectorizer output: a <2 x i32> induction whose lanes are masked by
; "icmp ule ..., 14", every store emitted in its own pred.store.if block guarded
; by one lane of that mask, a vector loop that exits once the index reaches 16,
; and a middle block that branches unconditionally to for.end (so the scalar
; loop is kept but never entered from the vector path).
define void @maxvf3() {
; CHECK-LABEL: @maxvf3(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 14, i32 14>
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP2]]
; CHECK-NEXT: store i8 69, ptr [[TMP3]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
; CHECK: pred.store.if1:
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP5]]
; CHECK-NEXT: store i8 69, ptr [[TMP6]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
; CHECK: pred.store.continue2:
; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw <2 x i32> <i32 3, i32 3>, [[VEC_IND]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
; CHECK: pred.store.if3:
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP9]]
; CHECK-NEXT: store i8 7, ptr [[TMP10]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
; CHECK: pred.store.continue4:
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.if5:
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[TMP12]]
; CHECK-NEXT: store i8 7, ptr [[TMP13]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.continue6:
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[J:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[AJ:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[J]]
; CHECK-NEXT: store i8 69, ptr [[AJ]], align 8
; CHECK-NEXT: [[JP3:%.*]] = add nuw nsw i32 3, [[J]]
; CHECK-NEXT: [[AJP3:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[JP3]]
; CHECK-NEXT: store i8 7, ptr [[AJP3]], align 8
; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i32 [[J]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[J_NEXT]], 15
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
; Input loop: j starts at 0 and the loop exits once j.next reaches 15.
for.body:
%j = phi i32 [ 0, %entry ], [ %j.next, %for.body ]
; a[j] = 69
%aj = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 %j
store i8 69, ptr %aj, align 8
; a[j+3] = 7 -- writes the element that the a[j] store reaches three iterations
; later, which is the 3-iteration dependence distance this test relies on.
%jp3 = add nuw nsw i32 3, %j
%ajp3 = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 %jp3
store i8 7, ptr %ajp3, align 8
%j.next = add nuw nsw i32 %j, 1
%exitcond = icmp eq i32 %j.next, 15
; The back-edge carries !llvm.loop !0, which sets llvm.loop.vectorize.enable.
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
for.end:
ret void
}
; Loop metadata attached to the back-edge branch of for.body in @maxvf3:
; !0 is the self-referential loop ID, !1 explicitly enables vectorization.
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 true}