This is a simple implementation of the unroll-and-jam classical loop
optimisation.
The basic idea is that we take an outer loop of the form:
  for i..
    ForeBlocks(i)
    for j..
      SubLoopBlocks(i, j)
    AftBlocks(i)
Instead of doing normal inner or outer unrolling, we unroll as follows:
  for i... i+=2
    ForeBlocks(i)
    ForeBlocks(i+1)
    for j..
      SubLoopBlocks(i, j)
      SubLoopBlocks(i+1, j)
    AftBlocks(i)
    AftBlocks(i+1)
  Remainder Loop
So we have unrolled the outer loop, then jammed the two inner loops into
one. This can lead to a simpler inner loop if memory accesses can be shared
between the now jammed loops.
To do this we have to prove that this is all safe, both for the memory
accesses (using dependence analysis) and that ForeBlocks(i+1) can move before
AftBlocks(i) and SubLoopBlocks(i, j).
Differential Revision: https://reviews.llvm.org/D41953
llvm-svn: 336062
471 lines
14 KiB
LLVM
471 lines
14 KiB
LLVM
; RUN: opt -basicaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s

; Every function in this file is a two-level loop nest (outer index %i, inner
; index %j) that stores to %A twice. The function name says where the two
; stores live (fore = for.outer, sub = for.inner, aft = for.latch) and how the
; second store's index relates to %i (less = %i-1, eq = %i, more = %i+1).
; The test directives count copies of the inner induction phi %j: %j.1, %j.2
; and %j.3 only exist if the nest was unroll-and-jammed by the requested
; count of 4.

target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"

; for.outer (fore) stores 1 to A[i]; for.latch (aft) stores the accumulated
; value %add to A[i-1]. Four copies of the %j phi are expected, i.e. the nest
; is expected to be unroll-and-jammed by 4.
; CHECK-LABEL: fore_aft_less
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @fore_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block: A[i] = 1.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i, for j in [0, N).
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add6 = add nuw nsw i32 %j, 1
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block: A[i-1] = sum.
  ; NOTE(review): `nuw` on an add of -1 wraps unsigned for every %i >= 1;
  ; kept exactly as in the original test input.
  %add7 = add nuw nsw i32 %i, 1
  %add72 = add nuw nsw i32 %i, -1
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}

; for.outer (fore) stores 1 to A[i]; for.latch (aft) stores the accumulated
; value %add to A[i+0]. Four copies of the %j phi are expected, i.e. the nest
; is expected to be unroll-and-jammed by 4.
; CHECK-LABEL: fore_aft_eq
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @fore_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block: A[i] = 1.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i, for j in [0, N).
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add6 = add nuw nsw i32 %j, 1
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block: A[i+0] = sum. The index goes through %add72 (offset 0) so this
  ; test has the same shape as the other offset variants instead of leaving
  ; %add72 unused.
  %add7 = add nuw nsw i32 %i, 1
  %add72 = add nuw nsw i32 %i, 0
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}

; for.outer (fore) stores 1 to A[i]; for.latch (aft) stores the accumulated
; value %add to A[i+1]. The directives require that no second copy (%j.1) of
; the inner phi appears, i.e. the nest is expected to be left un-jammed.
; CHECK-LABEL: fore_aft_more
; CHECK: %j = phi
; CHECK-NOT: %j.1 = phi
define void @fore_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block: A[i] = 1.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i, for j in [0, N).
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add6 = add nuw nsw i32 %j, 1
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block: A[i+1] = sum, the slot the next outer iteration's fore store
  ; also writes.
  %add7 = add nuw nsw i32 %i, 1
  %add72 = add nuw nsw i32 %i, 1
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}

; for.outer (fore) stores 1 to A[i]; for.inner (sub) stores the running value
; %add to A[i-1] on every inner iteration. Four copies of the %j phi are
; expected, i.e. the nest is expected to be unroll-and-jammed by 4.
; CHECK-LABEL: fore_sub_less
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @fore_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block: A[i] = 1.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i; A[i-1] = sum, for j in [0, N).
  ; NOTE(review): `nuw` on an add of -1 wraps unsigned for every %i >= 1;
  ; kept exactly as in the original test input.
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add72 = add nuw nsw i32 %i, -1
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %add6 = add nuw nsw i32 %j, 1
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block only advances the outer induction variable.
  %add7 = add nuw nsw i32 %i, 1
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}

; for.outer (fore) stores 1 to A[i]; for.inner (sub) stores the running value
; %add to A[i+0] on every inner iteration. Four copies of the %j phi are
; expected, i.e. the nest is expected to be unroll-and-jammed by 4.
; CHECK-LABEL: fore_sub_eq
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @fore_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block: A[i] = 1.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i; A[i+0] = sum, for j in [0, N).
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add72 = add nuw nsw i32 %i, 0
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %add6 = add nuw nsw i32 %j, 1
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block only advances the outer induction variable.
  %add7 = add nuw nsw i32 %i, 1
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}

; for.outer (fore) stores 1 to A[i]; for.inner (sub) stores the running value
; %add to A[i+1] on every inner iteration. The directives require that no
; second copy (%j.1) of the inner phi appears, i.e. the nest is expected to be
; left un-jammed.
; CHECK-LABEL: fore_sub_more
; CHECK: %j = phi
; CHECK-NOT: %j.1 = phi
define void @fore_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block: A[i] = 1.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i; A[i+1] = sum, for j in [0, N).
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add72 = add nuw nsw i32 %i, 1
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %add6 = add nuw nsw i32 %j, 1
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block only advances the outer induction variable.
  %add7 = add nuw nsw i32 %i, 1
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}

; for.inner (sub) stores 1 to A[i] on every inner iteration; for.latch (aft)
; stores the accumulated value %add to A[i-1]. Four copies of the %j phi are
; expected, i.e. the nest is expected to be unroll-and-jammed by 4.
; CHECK-LABEL: sub_aft_less
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @sub_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block holds only the outer induction phi.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i; A[i] = 1, for j in [0, N).
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add6 = add nuw nsw i32 %j, 1
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block: A[i-1] = sum.
  ; NOTE(review): `nuw` on an add of -1 wraps unsigned for every %i >= 1;
  ; kept exactly as in the original test input.
  %add7 = add nuw nsw i32 %i, 1
  %add72 = add nuw nsw i32 %i, -1
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}

; for.inner (sub) stores 1 to A[i] on every inner iteration; for.latch (aft)
; stores the accumulated value %add to A[i+0]. Four copies of the %j phi are
; expected, i.e. the nest is expected to be unroll-and-jammed by 4.
; CHECK-LABEL: sub_aft_eq
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @sub_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block holds only the outer induction phi.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i; A[i] = 1, for j in [0, N).
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add6 = add nuw nsw i32 %j, 1
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block: A[i+0] = sum. The index goes through %add72 (offset 0) so this
  ; test has the same shape as the other offset variants instead of leaving
  ; %add72 unused.
  %add7 = add nuw nsw i32 %i, 1
  %add72 = add nuw nsw i32 %i, 0
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}

; for.inner (sub) stores 1 to A[i] on every inner iteration; for.latch (aft)
; stores the accumulated value %add to A[i+1]. The directives require that no
; second copy (%j.1) of the inner phi appears, i.e. the nest is expected to be
; left un-jammed.
; CHECK-LABEL: sub_aft_more
; CHECK: %j = phi
; CHECK-NOT: %j.1 = phi
define void @sub_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block holds only the outer induction phi.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i; A[i] = 1, for j in [0, N).
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add6 = add nuw nsw i32 %j, 1
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block: A[i+1] = sum, the slot the next outer iteration's inner loop
  ; also writes.
  %add7 = add nuw nsw i32 %i, 1
  %add72 = add nuw nsw i32 %i, 1
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}

; Both stores live in for.inner (sub): 1 to A[i], then the running value %add
; to A[i-1], on every inner iteration. The directives require that no second
; copy (%j.1) of the inner phi appears, i.e. the nest is expected to be left
; un-jammed.
; CHECK-LABEL: sub_sub_less
; CHECK: %j = phi
; CHECK-NOT: %j.1 = phi
define void @sub_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block holds only the outer induction phi.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i; A[i] = 1; A[i-1] = sum, for j in [0, N).
  ; NOTE(review): `nuw` on an add of -1 wraps unsigned for every %i >= 1;
  ; kept exactly as in the original test input.
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add6 = add nuw nsw i32 %j, 1
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  %add72 = add nuw nsw i32 %i, -1
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block only advances the outer induction variable.
  %add7 = add nuw nsw i32 %i, 1
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}

; Both stores live in for.inner (sub): 1 to A[i], then the running value %add
; to A[i+0], on every inner iteration. The nest is expected to be
; unroll-and-jammed; like the other positive tests, all four copies of the %j
; phi produced by the requested count of 4 are verified.
; CHECK-LABEL: sub_sub_eq
; CHECK: %j = phi
; CHECK: %j.1 = phi
; CHECK: %j.2 = phi
; CHECK: %j.3 = phi
define void @sub_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block holds only the outer induction phi.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i; A[i] = 1; A[i+0] = sum, for j in [0, N).
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add6 = add nuw nsw i32 %j, 1
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  %add72 = add nuw nsw i32 %i, 0
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block only advances the outer induction variable.
  %add7 = add nuw nsw i32 %i, 1
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}

; Both stores live in for.inner (sub): 1 to A[i], then the running value %add
; to A[i+1], on every inner iteration. The directives require that no second
; copy (%j.1) of the inner phi appears, i.e. the nest is expected to be left
; un-jammed.
; CHECK-LABEL: sub_sub_more
; CHECK: %j = phi
; CHECK-NOT: %j.1 = phi
define void @sub_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
entry:
  ; Skip the whole nest when N <= 0.
  %cmp = icmp sgt i32 %N, 0
  br i1 %cmp, label %for.outer, label %cleanup

for.outer:
  ; Fore block holds only the outer induction phi.
  %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
  br label %for.inner

for.inner:
  ; Inner loop: sum += B[j] * i; A[i] = 1; A[i+1] = sum, for j in [0, N).
  %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
  %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
  %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
  %0 = load i32, i32* %arrayidx5, align 4
  %mul = mul nsw i32 %0, %i
  %add = add nsw i32 %mul, %sum
  %add6 = add nuw nsw i32 %j, 1
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 1, i32* %arrayidx, align 4
  %add72 = add nuw nsw i32 %i, 1
  %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
  store i32 %add, i32* %arrayidx8, align 4
  %exitcond = icmp eq i32 %add6, %N
  br i1 %exitcond, label %for.latch, label %for.inner

for.latch:
  ; Aft block only advances the outer induction variable.
  %add7 = add nuw nsw i32 %i, 1
  %exitcond29 = icmp eq i32 %add7, %N
  br i1 %exitcond29, label %cleanup, label %for.outer

cleanup:
  ret void
}