Files
clang-p2996/llvm/test/Transforms/LoopUnroll/ARM/loop-unrolling.ll
Sam Parker a16667e79b [ARM] Use Cortex-A57 sched model for Cortex-A72
This mirrors what we already do for AArch64 as the cores are similar.
As discussed in the review, enabling the machine scheduler causes
more variations in performance changes so it is not enabled for now.
This patch improves LNT scores by a geomean of 1.57% at -O3.

Differential Revision: https://reviews.llvm.org/D53562

llvm-svn: 345272
2018-10-25 15:08:29 +00:00

248 lines
11 KiB
LLVM

; Loop-unroller expectations for several ARM cores. Every invocation below runs
; opt with -loop-unroll for one triple/CPU pair and selects exactly one
; FileCheck prefix: the Cortex-A57 and Cortex-A72 runs share the "A" prefix,
; the Cortex-M23 run uses the "T1" prefix, and the Cortex-M33 and Cortex-M7
; runs share the "T2" prefix.
; RUN: opt -mtriple=armv7 -mcpu=cortex-a57 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-A
; RUN: opt -mtriple=thumbv7 -mcpu=cortex-a57 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-A
; RUN: opt -mtriple=thumbv7 -mcpu=cortex-a72 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-A
; RUN: opt -mtriple=thumbv8m -mcpu=cortex-m23 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-T1
; RUN: opt -mtriple=thumbv8m.main -mcpu=cortex-m33 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-T2
; RUN: opt -mtriple=thumbv7em -mcpu=cortex-m7 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-T2
; @partial: loop with a constant trip count of 1024 that stores A[i] * B[i]
; into C[i]. The A-profile runs expect the induction increment to be chained
; twice per iteration, the T1 run expects a single increment (loop left as
; written), and the T2 runs expect sixteen chained increments.
;
; The labels are spelled once per prefix because each invocation of FileCheck
; enables only its own custom prefix; a label written with the default prefix
; is never evaluated and would not fence these checks off from later functions.
; CHECK-UNROLL-A-LABEL: @partial(
; CHECK-UNROLL-T1-LABEL: @partial(
; CHECK-UNROLL-T2-LABEL: @partial(
define arm_aapcs_vfpcc void @partial(i32* nocapture %C, i32* nocapture readonly %A, i32* nocapture readonly %B) local_unnamed_addr #0 {
entry:
br label %for.body
; Expected shape of the loop body for each prefix.
for.body:
; CHECK-UNROLL-A: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV2:%[a-z.0-9]+]], %for.body ]
; CHECK-UNROLL-A: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
; CHECK-UNROLL-A: [[IV2]] = add nuw nsw i32 [[IV1]], 1
; CHECK-UNROLL-A: [[CMP:%[a-z.0-9]+]] = icmp eq i32 [[IV2]], 1024
; CHECK-UNROLL-A: br i1 [[CMP]], label [[END:%[a-z.]+]], label %for.body
; CHECK-UNROLL-T1: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV1:%[a-z.0-9]+]], %for.body ]
; CHECK-UNROLL-T1: [[IV1]] = add nuw nsw i32 [[IV0]], 1
; CHECK-UNROLL-T1: [[CMP:%[a-z.0-9]+]] = icmp eq i32 [[IV1]], 1024
; CHECK-UNROLL-T1: br i1 [[CMP]], label [[END:%[a-z.]+]], label %for.body
; CHECK-UNROLL-T2: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV16:%[a-z.0-9]+]], %for.body ]
; CHECK-UNROLL-T2: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
; CHECK-UNROLL-T2: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1
; CHECK-UNROLL-T2: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1
; CHECK-UNROLL-T2: [[IV4:%[a-z.0-9]+]] = add nuw nsw i32 [[IV3]], 1
; CHECK-UNROLL-T2: [[IV5:%[a-z.0-9]+]] = add nuw nsw i32 [[IV4]], 1
; CHECK-UNROLL-T2: [[IV6:%[a-z.0-9]+]] = add nuw nsw i32 [[IV5]], 1
; CHECK-UNROLL-T2: [[IV7:%[a-z.0-9]+]] = add nuw nsw i32 [[IV6]], 1
; CHECK-UNROLL-T2: [[IV8:%[a-z.0-9]+]] = add nuw nsw i32 [[IV7]], 1
; CHECK-UNROLL-T2: [[IV9:%[a-z.0-9]+]] = add nuw nsw i32 [[IV8]], 1
; CHECK-UNROLL-T2: [[IV10:%[a-z.0-9]+]] = add nuw nsw i32 [[IV9]], 1
; CHECK-UNROLL-T2: [[IV11:%[a-z.0-9]+]] = add nuw nsw i32 [[IV10]], 1
; CHECK-UNROLL-T2: [[IV12:%[a-z.0-9]+]] = add nuw nsw i32 [[IV11]], 1
; CHECK-UNROLL-T2: [[IV13:%[a-z.0-9]+]] = add nuw nsw i32 [[IV12]], 1
; CHECK-UNROLL-T2: [[IV14:%[a-z.0-9]+]] = add nuw nsw i32 [[IV13]], 1
; CHECK-UNROLL-T2: [[IV15:%[a-z.0-9]+]] = add nuw nsw i32 [[IV14]], 1
; CHECK-UNROLL-T2: [[IV16]] = add nuw nsw i32 [[IV15]], 1
; CHECK-UNROLL-T2: [[CMP:%[a-z.0-9]+]] = icmp eq i32 [[IV16]], 1024
; CHECK-UNROLL-T2: br i1 [[CMP]], label [[END:%[a-z.]+]], label %for.body
%i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.08
%0 = load i32, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.08
%1 = load i32, i32* %arrayidx1, align 4
%mul = mul nsw i32 %1, %0
%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.08
store i32 %mul, i32* %arrayidx2, align 4
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup:
ret void
}
; @runtime: same A[i] * B[i] -> C[i] loop, but the trip count is the run-time
; argument %N (the entry block skips the loop when %N is zero). The A-profile
; runs expect two chained increments, the T1 run expects the original %i.09 /
; %inc / %exitcond values untouched, and the T2 runs expect four chained
; increments followed by three epilogue blocks.
;
; The labels are spelled once per prefix because each invocation of FileCheck
; enables only its own custom prefix; a label written with the default prefix
; is never evaluated. Anchoring on "@runtime(" keeps the label from matching
; inside the name of @nested_runtime.
; CHECK-UNROLL-A-LABEL: @runtime(
; CHECK-UNROLL-T1-LABEL: @runtime(
; CHECK-UNROLL-T2-LABEL: @runtime(
define arm_aapcs_vfpcc void @runtime(i32* nocapture %C, i32* nocapture readonly %A, i32* nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %for.body
; Expected shape of the loop body for each prefix.
for.body:
; CHECK-UNROLL-A: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z.0-9]+]] ], [ [[IV2:%[a-z.0-9]+]], %for.body ]
; CHECK-UNROLL-A: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
; CHECK-UNROLL-A: [[IV2]] = add nuw i32 [[IV1]], 1
; CHECK-UNROLL-A: br
; CHECK-UNROLL-T1: %i.09 = phi i32 [ %inc, %for.body ], [ 0
; CHECK-UNROLL-T1: %inc = add nuw i32 %i.09, 1
; CHECK-UNROLL-T1: %exitcond = icmp eq i32 %inc, %N
; CHECK-UNROLL-T1: br
; CHECK-UNROLL-T2: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z.0-9]+]] ], [ [[IV4:%[a-z.0-9]+]], %for.body ]
; CHECK-UNROLL-T2: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
; CHECK-UNROLL-T2: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1
; CHECK-UNROLL-T2: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1
; CHECK-UNROLL-T2: [[IV4]] = add nuw i32 [[IV3]], 1
; CHECK-UNROLL-T2: br
; CHECK-UNROLL-T2: for.body.epil:
; CHECK-UNROLL-T2: for.body.epil.1:
; CHECK-UNROLL-T2: for.body.epil.2:
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.09
%0 = load i32, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.09
%1 = load i32, i32* %arrayidx1, align 4
%mul = mul nsw i32 %1, %0
%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.09
store i32 %mul, i32* %arrayidx2, align 4
%inc = add nuw i32 %i.09, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup, label %for.body
for.cond.cleanup:
ret void
}
; @nested_runtime: two nested loops, both bounded by the run-time argument %N.
; The inner loop (for.body4) accumulates sext(A[h * N + w]) * sext(B[w]) into
; C[w]. Only the inner loop is checked: the T1 run expects a single increment,
; and the T2 runs expect an epilogue block, four chained increments in the main
; loop, and two further epilogue blocks. The A-profile runs have no body checks
; here; their label only fences neighbouring functions' checks.
;
; The labels are spelled once per prefix because each invocation of FileCheck
; enables only its own custom prefix; a label written with the default prefix
; is never evaluated.
; CHECK-UNROLL-A-LABEL: @nested_runtime(
; CHECK-UNROLL-T1-LABEL: @nested_runtime(
; CHECK-UNROLL-T2-LABEL: @nested_runtime(
define arm_aapcs_vfpcc void @nested_runtime(i32* nocapture %C, i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
entry:
%cmp25 = icmp eq i32 %N, 0
br i1 %cmp25, label %for.cond.cleanup, label %for.body4.lr.ph
for.body4.lr.ph:
%h.026 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %entry ]
%mul = mul i32 %h.026, %N
br label %for.body4
for.cond.cleanup:
ret void
for.cond.cleanup3:
%inc11 = add nuw i32 %h.026, 1
%exitcond27 = icmp eq i32 %inc11, %N
br i1 %exitcond27, label %for.cond.cleanup, label %for.body4.lr.ph
; Expected shape of the inner loop for each prefix.
for.body4:
; CHECK-UNROLL-T1: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z0-9.]+]] ], [ [[IV1:%[a-z.0-9]+]], %for.body4 ]
; CHECK-UNROLL-T1: [[IV1]] = add nuw i32 [[IV0]], 1
; CHECK-UNROLL-T1: br
; CHECK-UNROLL-T2: for.body4.epil:
; CHECK-UNROLL-T2: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, [[PRE:%[a-z0-9.]+]] ], [ [[IV4:%[a-z.0-9]+]], %for.body4 ]
; CHECK-UNROLL-T2: [[IV1:%[a-z.0-9]+]] = add nuw nsw i32 [[IV0]], 1
; CHECK-UNROLL-T2: [[IV2:%[a-z.0-9]+]] = add nuw nsw i32 [[IV1]], 1
; CHECK-UNROLL-T2: [[IV3:%[a-z.0-9]+]] = add nuw nsw i32 [[IV2]], 1
; CHECK-UNROLL-T2: [[IV4]] = add nuw i32 [[IV3]], 1
; CHECK-UNROLL-T2: br
; CHECK-UNROLL-T2: for.body4.epil.1:
; CHECK-UNROLL-T2: for.body4.epil.2:
%w.024 = phi i32 [ 0, %for.body4.lr.ph ], [ %inc, %for.body4 ]
%add = add i32 %w.024, %mul
%arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
%0 = load i16, i16* %arrayidx, align 2
%conv = sext i16 %0 to i32
%arrayidx5 = getelementptr inbounds i16, i16* %B, i32 %w.024
%1 = load i16, i16* %arrayidx5, align 2
%conv6 = sext i16 %1 to i32
%mul7 = mul nsw i32 %conv6, %conv
%arrayidx8 = getelementptr inbounds i32, i32* %C, i32 %w.024
%2 = load i32, i32* %arrayidx8, align 4
%add9 = add nsw i32 %mul7, %2
store i32 %add9, i32* %arrayidx8, align 4
%inc = add nuw i32 %w.024, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup3, label %for.body4
}
; @loop_call: loop with a constant trip count of 1024 whose body calls
; @some_func on A[i] and B[i] and stores the result to C[i]. All three prefixes
; expect the same shape: one increment per iteration compared against 1024,
; i.e. the loop is left as written.
;
; The labels are spelled once per prefix because each invocation of FileCheck
; enables only its own custom prefix; a label written with the default prefix
; is never evaluated.
; CHECK-UNROLL-A-LABEL: @loop_call(
; CHECK-UNROLL-T1-LABEL: @loop_call(
; CHECK-UNROLL-T2-LABEL: @loop_call(
define arm_aapcs_vfpcc void @loop_call(i32* nocapture %C, i32* nocapture readonly %A, i32* nocapture readonly %B) local_unnamed_addr #1 {
entry:
br label %for.body
for.cond.cleanup:
ret void
; Expected shape of the loop body for each prefix.
for.body:
; CHECK-UNROLL-A: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV1:%[a-z.0-9]+]], %for.body ]
; CHECK-UNROLL-A: [[IV1]] = add nuw nsw i32 [[IV0]], 1
; CHECK-UNROLL-A: icmp eq i32 [[IV1]], 1024
; CHECK-UNROLL-A: br
; CHECK-UNROLL-T1: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV1:%[a-z.0-9]+]], %for.body ]
; CHECK-UNROLL-T1: [[IV1]] = add nuw nsw i32 [[IV0]], 1
; CHECK-UNROLL-T1: icmp eq i32 [[IV1]], 1024
; CHECK-UNROLL-T1: br
; CHECK-UNROLL-T2: [[IV0:%[a-z.0-9]+]] = phi i32 [ 0, %entry ], [ [[IV1:%[a-z.0-9]+]], %for.body ]
; CHECK-UNROLL-T2: [[IV1]] = add nuw nsw i32 [[IV0]], 1
; CHECK-UNROLL-T2: icmp eq i32 [[IV1]], 1024
; CHECK-UNROLL-T2: br
%i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.08
%0 = load i32, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.08
%1 = load i32, i32* %arrayidx1, align 4
%call = tail call arm_aapcs_vfpcc i32 @some_func(i32 %0, i32 %1) #3
%arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.08
store i32 %call, i32* %arrayidx2, align 4
%inc = add nuw nsw i32 %i.08, 1
%exitcond = icmp eq i32 %inc, 1024
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
; @iterate_inc: walks a linked list of %struct.Node, adding 1 to the i32 field
; of every node until the next pointer is null. The A-profile and T1 runs
; expect the original %n.addr.04 phi and %tobool compare, with no further load
; after the loop branch. The T2 runs expect five null-pointer compares that all
; branch to the same exit label, again with no load afterwards.
;
; The labels are spelled once per prefix because each invocation of FileCheck
; enables only its own custom prefix; a label written with the default prefix
; is never evaluated.
; CHECK-UNROLL-A-LABEL: @iterate_inc(
; CHECK-UNROLL-T1-LABEL: @iterate_inc(
; CHECK-UNROLL-T2-LABEL: @iterate_inc(
; CHECK-UNROLL-A: %n.addr.04 = phi %struct.Node* [ %1, %while.body ], [ %n, %while.body.preheader ]
; CHECK-UNROLL-A: %tobool = icmp eq %struct.Node* %1, null
; CHECK-UNROLL-A: br i1 %tobool
; CHECK-UNROLL-A-NOT: load
; CHECK-UNROLL-T1: %n.addr.04 = phi %struct.Node* [ %1, %while.body ], [ %n, %while.body.preheader ]
; CHECK-UNROLL-T1: %tobool = icmp eq %struct.Node* %1, null
; CHECK-UNROLL-T1: br i1 %tobool
; CHECK-UNROLL-T1-NOT: load
; CHECK-UNROLL-T2: [[CMP0:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR0:%[a-z.0-9]+]], null
; CHECK-UNROLL-T2: br i1 [[CMP0]], label [[END:%[a-z.0-9]+]]
; CHECK-UNROLL-T2: [[CMP1:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR1:%[a-z.0-9]+]], null
; CHECK-UNROLL-T2: br i1 [[CMP1]], label [[END]]
; CHECK-UNROLL-T2: [[CMP2:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR2:%[a-z.0-9]+]], null
; CHECK-UNROLL-T2: br i1 [[CMP2]], label [[END]]
; CHECK-UNROLL-T2: [[CMP3:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR3:%[a-z.0-9]+]], null
; CHECK-UNROLL-T2: br i1 [[CMP3]], label [[END]]
; CHECK-UNROLL-T2: [[CMP4:%[a-z.0-9]+]] = icmp eq %struct.Node* [[VAR4:%[a-z.0-9]+]], null
; CHECK-UNROLL-T2: br i1 [[CMP4]], label [[END]]
; CHECK-UNROLL-T2-NOT: load
; List node: field 0 is the pointer to the next node, field 1 is the i32 payload.
%struct.Node = type { %struct.Node*, i32 }
define arm_aapcscc void @iterate_inc(%struct.Node* %n) local_unnamed_addr #0 {
entry:
; An empty list (null head) skips the loop entirely.
%tobool3 = icmp eq %struct.Node* %n, null
br i1 %tobool3, label %while.end, label %while.body.preheader
while.body.preheader:
br label %while.body
while.body:
%n.addr.04 = phi %struct.Node* [ %1, %while.body ], [ %n, %while.body.preheader ]
; Increment the payload of the current node in place.
%val = getelementptr inbounds %struct.Node, %struct.Node* %n.addr.04, i32 0, i32 1
%0 = load i32, i32* %val, align 4
%add = add nsw i32 %0, 1
store i32 %add, i32* %val, align 4
; Advance to the next node; a null next pointer ends the walk.
%next = getelementptr inbounds %struct.Node, %struct.Node* %n.addr.04, i32 0, i32 0
%1 = load %struct.Node*, %struct.Node** %next, align 4
%tobool = icmp eq %struct.Node* %1, null
br i1 %tobool, label %while.end, label %while.body
while.end:
ret void
}
; External function called from the loop body of @loop_call; it is only
; declared in this file, so its body is not visible here.
declare arm_aapcs_vfpcc i32 @some_func(i32, i32) local_unnamed_addr #2