The LoopInterchange cost model consists of several decision rules. They
are applied one by one, and once a rule determines profitability, the
remaining rules are not consulted. In the current implementation, the
rule based on `CacheCostAnalysis` is applied first; only if it fails to
determine profitability is the vectorization rule applied. However,
there are cases where interchanging loops for the sake of vectorization
makes the code faster even when the exchange is detrimental to the
cache. For example, exchanging the inner two loops in the following code
runs about 3x faster on my machine (compiled with `-O3
-mcpu=neoverse-v2 -mllvm -cache-line-size=64`), even though the exchange
is rejected by the cache-cost rule. (NOTE: LoopInterchange cannot
currently exchange these loops due to legality checks. That should also
be improved.)
```c
__attribute__((aligned(64))) float aa[256][256], bb[256][256], cc[256][256],
                                   dd[256][256], ee[256][256], ff[256][256];

// Alternative of TSVC s231 with more array accesses than the original.
void s231_alternative() {
  for (int nl = 0; nl < 100 * (100000 / 256); nl++) {
    for (int i = 0; i < 256; ++i) {
      for (int j = 1; j < 256; j++) {
        aa[j][i] = aa[j-1][i] + bb[j][i] + cc[i][j]
                 + dd[i][j] + ee[i][j] + ff[i][j];
      }
    }
  }
}
```
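
To make the rule ordering concrete, here is a minimal C++ sketch of the chaining described above. The names (`ProfitabilityRule`, `isProfitable`, the flag parameter) are illustrative only and do not come from the actual LoopInterchange code; they just model "ask each rule in order, and stop at the first one that can decide."

```cpp
#include <cstdio>
#include <functional>
#include <optional>
#include <utility>
#include <vector>

// Each rule either decides profitability (true/false) or abstains
// (std::nullopt), in which case the next rule is consulted.
using ProfitabilityRule = std::function<std::optional<bool>()>;

static bool isProfitable(bool PrioritizeVectorization) {
  // Stand-ins for the real rules. To mirror the s231 example above, the
  // cache rule votes against the interchange and the vectorization rule
  // votes in favor of it.
  ProfitabilityRule CacheRule = [] { return std::optional<bool>(false); };
  ProfitabilityRule VectorizeRule = [] { return std::optional<bool>(true); };

  // The new option only changes the order in which the rules are consulted.
  std::vector<ProfitabilityRule> Rules{CacheRule, VectorizeRule};
  if (PrioritizeVectorization)
    std::swap(Rules[0], Rules[1]);

  // The first rule that can decide wins; later rules are never asked.
  for (const ProfitabilityRule &Rule : Rules)
    if (std::optional<bool> Decision = Rule())
      return *Decision;
  return false; // No rule could decide; conservatively keep the loop order.
}

int main() {
  std::printf("default order:       %d\n", isProfitable(false)); // prints 0
  std::printf("vectorization first: %d\n", isProfitable(true));  // prints 1
}
```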
This patch introduces a new option,
`-loop-interchange-prioritize-vectorization`, that prioritizes the
vectorization rule over the cache-cost rule.
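When experimenting outside of `opt`, the flag should be reachable from clang as an LLVM option, e.g. `-mllvm -loop-interchange-prioritize-vectorization=1` (alongside whatever flags are needed to enable LoopInterchange itself); the test below drives it through `opt` directly.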
Related issue: #131130
---------
Co-authored-by: Florian Hahn <flo@fhahn.com>
The regression test included with this patch:

```llvm
; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
; RUN:     -pass-remarks-output=%t -disable-output
; RUN: FileCheck -input-file %t --check-prefix=PROFIT-CACHE %s

; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
; RUN:     -pass-remarks-output=%t -disable-output -loop-interchange-prioritize-vectorization=1
; RUN: FileCheck -input-file %t --check-prefix=PROFIT-VEC %s

@A = dso_local global [256 x [256 x float]] zeroinitializer
@B = dso_local global [256 x [256 x float]] zeroinitializer
@C = dso_local global [256 x [256 x float]] zeroinitializer
@D = dso_local global [256 x [256 x float]] zeroinitializer
@E = dso_local global [256 x [256 x float]] zeroinitializer
@F = dso_local global [256 x [256 x float]] zeroinitializer

; Check the behavior of the LoopInterchange cost-model. In the below code,
; exchanging the loops is not profitable in terms of cache, but it is necessary
; to vectorize the innermost loop.
;
; for (int i = 0; i < 256; i++)
;   for (int j = 1; j < 256; j++)
;     A[j][i] = A[j-1][i] + B[j][i] + C[i][j] + D[i][j] + E[i][j] + F[i][j];
;

; PROFIT-CACHE: --- !Missed
; PROFIT-CACHE-NEXT: Pass: loop-interchange
; PROFIT-CACHE-NEXT: Name: InterchangeNotProfitable
; PROFIT-CACHE-NEXT: Function: f
; PROFIT-CACHE-NEXT: Args:
; PROFIT-CACHE-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
; PROFIT-CACHE-NEXT: ...

; PROFIT-VEC: --- !Passed
; PROFIT-VEC-NEXT: Pass: loop-interchange
; PROFIT-VEC-NEXT: Name: Interchanged
; PROFIT-VEC-NEXT: Function: f
; PROFIT-VEC-NEXT: Args:
; PROFIT-VEC-NEXT: - String: Loop interchanged with enclosing loop.
; PROFIT-VEC-NEXT: ...

define void @f() {
entry:
  br label %for.i.header

for.i.header:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.i.inc ]
  br label %for.j.body

for.j.body:
  %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
  %j.dec = add nsw i64 %j, -1
  %a.0.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 0, i64 %j.dec, i64 %i
  %b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 0, i64 %j, i64 %i
  %c.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 0, i64 %i, i64 %j
  %d.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @D, i64 0, i64 %i, i64 %j
  %e.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @E, i64 0, i64 %i, i64 %j
  %f.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @F, i64 0, i64 %i, i64 %j
  %a.0 = load float, ptr %a.0.index, align 4
  %b = load float, ptr %b.index, align 4
  %c = load float, ptr %c.index, align 4
  %d = load float, ptr %d.index, align 4
  %e = load float, ptr %e.index, align 4
  %f = load float, ptr %f.index, align 4
  %add.0 = fadd float %a.0, %b
  %add.1 = fadd float %add.0, %c
  %add.2 = fadd float %add.1, %d
  %add.3 = fadd float %add.2, %e
  %add.4 = fadd float %add.3, %f
  %a.1.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 0, i64 %j, i64 %i
  store float %add.4, ptr %a.1.index, align 4
  %j.next = add nuw nsw i64 %j, 1
  %cmp.j = icmp eq i64 %j.next, 256
  br i1 %cmp.j, label %for.i.inc, label %for.j.body

for.i.inc:
  %i.next = add nuw nsw i64 %i, 1
  %cmp.i = icmp eq i64 %i.next, 256
  br i1 %cmp.i, label %exit, label %for.i.header

exit:
  ret void
}
```