Addition of this pass has been botched. There is no particular reason why it had to be sold as an inseparable part of the new-pm transition. It was added when old-pm was still the default, and very *very* few users were actually tracking new-pm, so its effects weren't measured. Which means that some of the turmoil of the new-pm transition is actually likely due to regressions caused by this pass. Likewise, there have been a number of pieces of post-commit feedback (post new-pm switch), namely * https://reviews.llvm.org/D37467#2787157 (regresses HW-loops) * https://reviews.llvm.org/D37467#2787259 (should not be in middle-end, should run after LSR, not before) * https://reviews.llvm.org/D95789 (an attempt to fix bad loop backedge metadata) and in the past half year, the pass authors (Google) still haven't found time to respond to any of that. Hereby it is proposed to back out the pass from the pipeline, until someone who cares about it can address the issues reported, and properly start the process of adding a new pass into the pipeline, with proper performance evaluation. Furthermore, neither Google nor Facebook reports any perf changes from this change, so I'm dropping the pass completely. It can always be re-reverted should anyone want to pick it up again. Reviewed By: aeubanks Differential Revision: https://reviews.llvm.org/D104099
218 lines
9.5 KiB
LLVM
218 lines
9.5 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
;
; The O3 pipeline is exercised with -rotation-max-header-size set to 0, 1, 2
; and 3, each time under both the legacy pass manager (-enable-new-pm=0) and
; the new pass manager (-passes='default<O3>').
;
; * Sizes 0 and 1 share the HOIST prefix: the expected output keeps the loop
;   unrotated, with the f0() call sitting in the loop header.
; * Sizes 2 and 3 use the ROTATED_LATER_{OLDPM,NEWPM} and ROTATE_{OLDPM,NEWPM}
;   prefixes: the expected output is a rotated loop whose body calls f0() and
;   f1() and whose exit test sits at the bottom of the body.

; RUN: opt -O3 -rotation-max-header-size=0 -S -enable-new-pm=0 < %s | FileCheck %s --check-prefix=HOIST
; RUN: opt -passes='default<O3>' -rotation-max-header-size=0 -S < %s | FileCheck %s --check-prefix=HOIST

; RUN: opt -O3 -rotation-max-header-size=1 -S -enable-new-pm=0 < %s | FileCheck %s --check-prefix=HOIST
; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefix=HOIST

; RUN: opt -O3 -rotation-max-header-size=2 -S -enable-new-pm=0 < %s | FileCheck %s --check-prefix=ROTATED_LATER_OLDPM
; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefix=ROTATED_LATER_NEWPM

; RUN: opt -O3 -rotation-max-header-size=3 -S -enable-new-pm=0 < %s | FileCheck %s --check-prefix=ROTATE_OLDPM
; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefix=ROTATE_NEWPM

; This example is produced from very basic C code:
;
;   void f0();
;   void f1();
;   void f2();
;
;   void loop(int width) {
;       if(width < 1)
;           return;
;       for(int i = 0; i < width - 1; ++i) {
;           f0();
;           f1();
;       }
;       f0();
;       f2();
;   }

; We have a choice here. We can either
; * hoist the f0() call into the loop header,
;   * which potentially makes loop rotation unprofitable since the loop header
;     might have grown above a certain threshold, and such unrotated loops will
;     be ignored by LoopVectorizer, preventing vectorization
;   * or loop rotation will succeed, resulting in some weird PHIs that will also
;     harm vectorization
; * or not hoist the f0() call before performing loop rotation,
;   at the cost of potential code bloat and/or potentially successfully rotating
;   the loops, vectorizing them at the cost of compile time.

; Module-level data layout string used for this test.
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"

; External callees. They are declarations only (no body, no attributes), so
; the optimizer has to treat every call to them as having unknown effects.
declare void @f0()
declare void @f1()
declare void @f2()

; Lifetime markers used around the loop counter's stack slot in @_Z4loopi.
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)

; void loop(int width) -- unoptimized input IR for the C function:
;
;   if (width < 1) return;
;   for (int i = 0; i < width - 1; ++i) { f0(); f1(); }
;   f0(); f2();
;
; %width and the counter %i live in allocas and are reloaded on every use;
; the pipeline under test is expected to promote them to SSA values. The
; autogenerated assertions below describe the two expected output shapes:
; the unrotated loop with f0() in its header (HOIST prefix) and the rotated
; loop (the four ROTATE* prefixes, whose bodies are currently identical).
define void @_Z4loopi(i32 %width) {
; HOIST-LABEL: @_Z4loopi(
; HOIST-NEXT:  entry:
; HOIST-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; HOIST-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; HOIST:       for.cond.preheader:
; HOIST-NEXT:    [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; HOIST-NEXT:    br label [[FOR_COND:%.*]]
; HOIST:       for.cond:
; HOIST-NEXT:    [[I_0:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ]
; HOIST-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[TMP0]]
; HOIST-NEXT:    tail call void @f0()
; HOIST-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; HOIST:       for.cond.cleanup:
; HOIST-NEXT:    tail call void @f2()
; HOIST-NEXT:    br label [[RETURN]]
; HOIST:       for.body:
; HOIST-NEXT:    tail call void @f1()
; HOIST-NEXT:    [[INC]] = add nuw i32 [[I_0]], 1
; HOIST-NEXT:    br label [[FOR_COND]]
; HOIST:       return:
; HOIST-NEXT:    ret void
;
; ROTATED_LATER_OLDPM-LABEL: @_Z4loopi(
; ROTATED_LATER_OLDPM-NEXT:  entry:
; ROTATED_LATER_OLDPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATED_LATER_OLDPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATED_LATER_OLDPM:       for.cond.preheader:
; ROTATED_LATER_OLDPM-NEXT:    [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
; ROTATED_LATER_OLDPM-NEXT:    br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; ROTATED_LATER_OLDPM:       for.body.preheader:
; ROTATED_LATER_OLDPM-NEXT:    [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATED_LATER_OLDPM-NEXT:    br label [[FOR_BODY:%.*]]
; ROTATED_LATER_OLDPM:       for.cond.cleanup:
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f0()
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f2()
; ROTATED_LATER_OLDPM-NEXT:    br label [[RETURN]]
; ROTATED_LATER_OLDPM:       for.body:
; ROTATED_LATER_OLDPM-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f0()
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f1()
; ROTATED_LATER_OLDPM-NEXT:    [[INC]] = add nuw nsw i32 [[I_04]], 1
; ROTATED_LATER_OLDPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
; ROTATED_LATER_OLDPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATED_LATER_OLDPM:       return:
; ROTATED_LATER_OLDPM-NEXT:    ret void
;
; ROTATED_LATER_NEWPM-LABEL: @_Z4loopi(
; ROTATED_LATER_NEWPM-NEXT:  entry:
; ROTATED_LATER_NEWPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATED_LATER_NEWPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATED_LATER_NEWPM:       for.cond.preheader:
; ROTATED_LATER_NEWPM-NEXT:    [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
; ROTATED_LATER_NEWPM-NEXT:    br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; ROTATED_LATER_NEWPM:       for.body.preheader:
; ROTATED_LATER_NEWPM-NEXT:    [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATED_LATER_NEWPM-NEXT:    br label [[FOR_BODY:%.*]]
; ROTATED_LATER_NEWPM:       for.cond.cleanup:
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f0()
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f2()
; ROTATED_LATER_NEWPM-NEXT:    br label [[RETURN]]
; ROTATED_LATER_NEWPM:       for.body:
; ROTATED_LATER_NEWPM-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f0()
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f1()
; ROTATED_LATER_NEWPM-NEXT:    [[INC]] = add nuw nsw i32 [[I_04]], 1
; ROTATED_LATER_NEWPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
; ROTATED_LATER_NEWPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATED_LATER_NEWPM:       return:
; ROTATED_LATER_NEWPM-NEXT:    ret void
;
; ROTATE_OLDPM-LABEL: @_Z4loopi(
; ROTATE_OLDPM-NEXT:  entry:
; ROTATE_OLDPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATE_OLDPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATE_OLDPM:       for.cond.preheader:
; ROTATE_OLDPM-NEXT:    [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
; ROTATE_OLDPM-NEXT:    br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; ROTATE_OLDPM:       for.body.preheader:
; ROTATE_OLDPM-NEXT:    [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATE_OLDPM-NEXT:    br label [[FOR_BODY:%.*]]
; ROTATE_OLDPM:       for.cond.cleanup:
; ROTATE_OLDPM-NEXT:    tail call void @f0()
; ROTATE_OLDPM-NEXT:    tail call void @f2()
; ROTATE_OLDPM-NEXT:    br label [[RETURN]]
; ROTATE_OLDPM:       for.body:
; ROTATE_OLDPM-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; ROTATE_OLDPM-NEXT:    tail call void @f0()
; ROTATE_OLDPM-NEXT:    tail call void @f1()
; ROTATE_OLDPM-NEXT:    [[INC]] = add nuw nsw i32 [[I_04]], 1
; ROTATE_OLDPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
; ROTATE_OLDPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATE_OLDPM:       return:
; ROTATE_OLDPM-NEXT:    ret void
;
; ROTATE_NEWPM-LABEL: @_Z4loopi(
; ROTATE_NEWPM-NEXT:  entry:
; ROTATE_NEWPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATE_NEWPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATE_NEWPM:       for.cond.preheader:
; ROTATE_NEWPM-NEXT:    [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
; ROTATE_NEWPM-NEXT:    br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; ROTATE_NEWPM:       for.body.preheader:
; ROTATE_NEWPM-NEXT:    [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATE_NEWPM-NEXT:    br label [[FOR_BODY:%.*]]
; ROTATE_NEWPM:       for.cond.cleanup:
; ROTATE_NEWPM-NEXT:    tail call void @f0()
; ROTATE_NEWPM-NEXT:    tail call void @f2()
; ROTATE_NEWPM-NEXT:    br label [[RETURN]]
; ROTATE_NEWPM:       for.body:
; ROTATE_NEWPM-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; ROTATE_NEWPM-NEXT:    tail call void @f0()
; ROTATE_NEWPM-NEXT:    tail call void @f1()
; ROTATE_NEWPM-NEXT:    [[INC]] = add nuw nsw i32 [[I_04]], 1
; ROTATE_NEWPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
; ROTATE_NEWPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATE_NEWPM:       return:
; ROTATE_NEWPM-NEXT:    ret void
;
; Spill the argument to its stack slot, then take the early exit when
; width < 1.
entry:
  %width.addr = alloca i32, align 4
  %i = alloca i32, align 4
  store i32 %width, i32* %width.addr, align 4
  %i1 = load i32, i32* %width.addr, align 4
  %cmp = icmp slt i32 %i1, 1
  br i1 %cmp, label %if.then, label %if.end

if.then:
  br label %return

; Start the lifetime of the counter slot and initialise i = 0.
if.end:
  %i2 = bitcast i32* %i to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %i2)
  store i32 0, i32* %i, align 4
  br label %for.cond

; Loop header: reload i and width, keep iterating while i < width - 1.
for.cond:
  %i3 = load i32, i32* %i, align 4
  %i4 = load i32, i32* %width.addr, align 4
  %sub = sub nsw i32 %i4, 1
  %cmp1 = icmp slt i32 %i3, %sub
  br i1 %cmp1, label %for.body, label %for.cond.cleanup

; Loop exit: end the lifetime of the counter slot.
for.cond.cleanup:
  %i5 = bitcast i32* %i to i8*
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %i5)
  br label %for.end

; Loop body: f0(); f1();
for.body:
  call void @f0()
  call void @f1()
  br label %for.inc

; Latch: ++i, then branch back to the header.
for.inc:
  %i6 = load i32, i32* %i, align 4
  %inc = add nsw i32 %i6, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

; Code after the loop: the trailing f0(); f2(); pair. This f0() call is the
; one that can be merged with the f0() call in the loop body.
for.end:
  call void @f0()
  call void @f2()
  br label %return

return:
  ret void
}