The basic problem we have is that we're trying to reuse an instruction which is mapped to some SCEV. Since we can have multiple such instructions (potentially with different flags), this is analogous to our need to drop flags when performing CSE. A trivial implementation would simply drop flags on any instruction we decided to reuse, and that would be correct. This patch is almost that trivial patch, except that we preserve flags on the reused instruction when existing users would imply UB on overflow already. Adding new users can, at most, refine this program to one which doesn't execute UB, which is valid. In practice, this fixes two conceptual problems with the previous code: 1) a binop could have been canonicalized into a form with different opcode or operands, or 2) the inbounds GEP case which was simply unhandled. On the test changes, most are pretty straightforward. We lose some flags (in some cases, they'd have been dropped on the next CSE pass anyway). The one that took me the longest to understand was the ashr-expansion test. What's happening there is that we're considering reuse of the mul; previously we disallowed it entirely, now we allow it with no flags. The surrounding diffs are all effects of generating the same mul with a different operand order, and then doing simple DCE. The loss of the inbounds is unfortunate, but even there, we can recover most of those once we actually treat branch-on-poison as immediate UB. Differential Revision: https://reviews.llvm.org/D112734
210 lines
9.1 KiB
LLVM
210 lines
9.1 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
;
; The invocations below sweep -rotation-max-header-size over 0, 1, 2 and 3,
; each time once with -enable-new-pm=0 and once with -passes='default<O3>'.
; Thresholds 0 and 1 share one check prefix; thresholds 2 and 3 use a separate
; prefix per invocation.
;
; RUN: opt -O3 -rotation-max-header-size=0 -S -enable-new-pm=0 < %s | FileCheck %s --check-prefix=HOIST
; RUN: opt -passes='default<O3>' -rotation-max-header-size=0 -S < %s | FileCheck %s --check-prefix=HOIST

; RUN: opt -O3 -rotation-max-header-size=1 -S -enable-new-pm=0 < %s | FileCheck %s --check-prefix=HOIST
; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefix=HOIST

; RUN: opt -O3 -rotation-max-header-size=2 -S -enable-new-pm=0 < %s | FileCheck %s --check-prefix=ROTATED_LATER_OLDPM
; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefix=ROTATED_LATER_NEWPM

; RUN: opt -O3 -rotation-max-header-size=3 -S -enable-new-pm=0 < %s | FileCheck %s --check-prefix=ROTATE_OLDPM
; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefix=ROTATE_NEWPM

; This example is produced from very basic C code:
;
;   void f0();
;   void f1();
;   void f2();
;
;   void loop(int width) {
;       if(width < 1)
;           return;
;       for(int i = 0; i < width - 1; ++i) {
;           f0();
;           f1();
;       }
;       f0();
;       f2();
;   }

; We have a choice here. We can either
; * hoist the f0() call into loop header,
;   * which potentially makes loop rotation unprofitable since loop header might
;     have grown above certain threshold, and such unrotated loops will be
;     ignored by LoopVectorizer, preventing vectorization
;   * or loop rotation will succeed, resulting in some weird PHIs that will also
;     harm vectorization
; * or not hoist f0() call before performing loop rotation,
;   at the cost of potential code bloat and/or potentially successfully rotating
;   the loops, vectorizing them at the cost of compile time.

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"

; Opaque external callees used by @_Z4loopi.
declare void @f0()
declare void @f1()
declare void @f2()

declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)

; Unoptimized IR for loop(int width): %width and the induction variable live in
; allocas. The function returns immediately when width < 1; otherwise it calls
; f0() and f1() while i < width - 1, then calls f0() followed by f2().
;
; The autogenerated expectations record two shapes of optimized output:
;  - thresholds 0 and 1 (the shared HOIST prefix): the exit test and the f0()
;    call sit in the for.cond header, and for.body only calls f1();
;  - thresholds 2 and 3 (the four remaining prefixes): the exit test sits at
;    the bottom of for.body, and f0() is called both in for.body and in
;    for.cond.cleanup.
define void @_Z4loopi(i32 %width) {
; HOIST-LABEL: @_Z4loopi(
; HOIST-NEXT:  entry:
; HOIST-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; HOIST-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; HOIST:       for.cond.preheader:
; HOIST-NEXT:    [[SUB:%.*]] = add nsw i32 [[WIDTH]], -1
; HOIST-NEXT:    br label [[FOR_COND:%.*]]
; HOIST:       for.cond:
; HOIST-NEXT:    [[I_0:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ]
; HOIST-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[SUB]]
; HOIST-NEXT:    tail call void @f0()
; HOIST-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
; HOIST:       for.cond.cleanup:
; HOIST-NEXT:    tail call void @f2()
; HOIST-NEXT:    br label [[RETURN]]
; HOIST:       for.body:
; HOIST-NEXT:    tail call void @f1()
; HOIST-NEXT:    [[INC]] = add nuw i32 [[I_0]], 1
; HOIST-NEXT:    br label [[FOR_COND]]
; HOIST:       return:
; HOIST-NEXT:    ret void
;
; ROTATED_LATER_OLDPM-LABEL: @_Z4loopi(
; ROTATED_LATER_OLDPM-NEXT:  entry:
; ROTATED_LATER_OLDPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATED_LATER_OLDPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATED_LATER_OLDPM:       for.cond.preheader:
; ROTATED_LATER_OLDPM-NEXT:    [[SUB:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATED_LATER_OLDPM-NEXT:    [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
; ROTATED_LATER_OLDPM-NEXT:    br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
; ROTATED_LATER_OLDPM:       for.cond.cleanup:
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f0()
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f2()
; ROTATED_LATER_OLDPM-NEXT:    br label [[RETURN]]
; ROTATED_LATER_OLDPM:       for.body:
; ROTATED_LATER_OLDPM-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_COND_PREHEADER]] ]
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f0()
; ROTATED_LATER_OLDPM-NEXT:    tail call void @f1()
; ROTATED_LATER_OLDPM-NEXT:    [[INC]] = add nuw nsw i32 [[I_04]], 1
; ROTATED_LATER_OLDPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[SUB]]
; ROTATED_LATER_OLDPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATED_LATER_OLDPM:       return:
; ROTATED_LATER_OLDPM-NEXT:    ret void
;
; ROTATED_LATER_NEWPM-LABEL: @_Z4loopi(
; ROTATED_LATER_NEWPM-NEXT:  entry:
; ROTATED_LATER_NEWPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATED_LATER_NEWPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATED_LATER_NEWPM:       for.cond.preheader:
; ROTATED_LATER_NEWPM-NEXT:    [[SUB:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATED_LATER_NEWPM-NEXT:    [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
; ROTATED_LATER_NEWPM-NEXT:    br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
; ROTATED_LATER_NEWPM:       for.cond.cleanup:
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f0()
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f2()
; ROTATED_LATER_NEWPM-NEXT:    br label [[RETURN]]
; ROTATED_LATER_NEWPM:       for.body:
; ROTATED_LATER_NEWPM-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_COND_PREHEADER]] ]
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f0()
; ROTATED_LATER_NEWPM-NEXT:    tail call void @f1()
; ROTATED_LATER_NEWPM-NEXT:    [[INC]] = add nuw nsw i32 [[I_04]], 1
; ROTATED_LATER_NEWPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[SUB]]
; ROTATED_LATER_NEWPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATED_LATER_NEWPM:       return:
; ROTATED_LATER_NEWPM-NEXT:    ret void
;
; ROTATE_OLDPM-LABEL: @_Z4loopi(
; ROTATE_OLDPM-NEXT:  entry:
; ROTATE_OLDPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATE_OLDPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATE_OLDPM:       for.cond.preheader:
; ROTATE_OLDPM-NEXT:    [[SUB:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATE_OLDPM-NEXT:    [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
; ROTATE_OLDPM-NEXT:    br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
; ROTATE_OLDPM:       for.cond.cleanup:
; ROTATE_OLDPM-NEXT:    tail call void @f0()
; ROTATE_OLDPM-NEXT:    tail call void @f2()
; ROTATE_OLDPM-NEXT:    br label [[RETURN]]
; ROTATE_OLDPM:       for.body:
; ROTATE_OLDPM-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_COND_PREHEADER]] ]
; ROTATE_OLDPM-NEXT:    tail call void @f0()
; ROTATE_OLDPM-NEXT:    tail call void @f1()
; ROTATE_OLDPM-NEXT:    [[INC]] = add nuw nsw i32 [[I_04]], 1
; ROTATE_OLDPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[SUB]]
; ROTATE_OLDPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATE_OLDPM:       return:
; ROTATE_OLDPM-NEXT:    ret void
;
; ROTATE_NEWPM-LABEL: @_Z4loopi(
; ROTATE_NEWPM-NEXT:  entry:
; ROTATE_NEWPM-NEXT:    [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
; ROTATE_NEWPM-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
; ROTATE_NEWPM:       for.cond.preheader:
; ROTATE_NEWPM-NEXT:    [[SUB:%.*]] = add nsw i32 [[WIDTH]], -1
; ROTATE_NEWPM-NEXT:    [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
; ROTATE_NEWPM-NEXT:    br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
; ROTATE_NEWPM:       for.cond.cleanup:
; ROTATE_NEWPM-NEXT:    tail call void @f0()
; ROTATE_NEWPM-NEXT:    tail call void @f2()
; ROTATE_NEWPM-NEXT:    br label [[RETURN]]
; ROTATE_NEWPM:       for.body:
; ROTATE_NEWPM-NEXT:    [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_COND_PREHEADER]] ]
; ROTATE_NEWPM-NEXT:    tail call void @f0()
; ROTATE_NEWPM-NEXT:    tail call void @f1()
; ROTATE_NEWPM-NEXT:    [[INC]] = add nuw nsw i32 [[I_04]], 1
; ROTATE_NEWPM-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[SUB]]
; ROTATE_NEWPM-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
; ROTATE_NEWPM:       return:
; ROTATE_NEWPM-NEXT:    ret void
;
entry:
  %width.addr = alloca i32, align 4
  %i = alloca i32, align 4
  store i32 %width, i32* %width.addr, align 4
  %i1 = load i32, i32* %width.addr, align 4
  ; Early exit: nothing is called when width < 1.
  %cmp = icmp slt i32 %i1, 1
  br i1 %cmp, label %if.then, label %if.end

if.then:
  br label %return

if.end:
  %i2 = bitcast i32* %i to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* %i2)
  ; i = 0
  store i32 0, i32* %i, align 4
  br label %for.cond

for.cond:
  ; Loop header: continue while i < width - 1 (signed compare).
  %i3 = load i32, i32* %i, align 4
  %i4 = load i32, i32* %width.addr, align 4
  %sub = sub nsw i32 %i4, 1
  %cmp1 = icmp slt i32 %i3, %sub
  br i1 %cmp1, label %for.body, label %for.cond.cleanup

for.cond.cleanup:
  %i5 = bitcast i32* %i to i8*
  call void @llvm.lifetime.end.p0i8(i64 4, i8* %i5)
  br label %for.end

for.body:
  call void @f0()
  call void @f1()
  br label %for.inc

for.inc:
  ; ++i
  %i6 = load i32, i32* %i, align 4
  %inc = add nsw i32 %i6, 1
  store i32 %inc, i32* %i, align 4
  br label %for.cond

for.end:
  ; Tail of the function: the same f0() call as in for.body, then f2().
  call void @f0()
  call void @f2()
  br label %return

return:
  ret void
}