clang-p2996/llvm/test/Transforms/LoopInterchange/profitability-vectorization.ll
Ryotaro Kasuga 17b202fc17 [LoopInterchange] Add an option to prioritize vectorization (#131988)
The LoopInterchange cost model consists of several decision rules. They
are applied one by one, and once a rule can determine profitability,
the remaining rules are not consulted. In the current implementation,
the rule based on `CacheCostAnalysis` is applied first; only if it
fails to determine profitability is the rule for vectorization applied.
However, there are cases where interchanging loops for vectorization
makes the code faster even if the exchange is detrimental to the cache.
For example, interchanging the inner two loops in the following example
runs about 3x faster on my local machine (compiled with `-O3
-mcpu=neoverse-v2 -mllvm -cache-line-size=64`), even though the
interchange is rejected by the rule based on cache cost. (NOTE:
LoopInterchange cannot currently exchange these loops due to its
legality checks; this should also be improved.)

```c
__attribute__((aligned(64))) float aa[256][256],bb[256][256],cc[256][256],
                                   dd[256][256],ee[256][256],ff[256][256];

// A variant of TSVC s231 with more array accesses than the original.
void s231_alternative() {
  for (int nl = 0; nl < 100*(100000/256); nl++) {
    for (int i = 0; i < 256; ++i) {
      for (int j = 1; j < 256; j++) {
        aa[j][i] = aa[j-1][i] + bb[j][i] + cc[i][j]
                 + dd[i][j] + ee[i][j] + ff[i][j];
      }
    }
  }
}
```
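
For reference, here is what the loop nest looks like with the inner two loops interchanged by hand. This is only an illustrative sketch (the function name is mine and not part of the patch; the array declarations are repeated so the snippet compiles on its own), not output of the pass, which currently rejects this interchange because of its legality checks. After the swap, the recurrence on `aa` is carried by the outer `j` loop, so the new innermost `i` loop becomes vectorizable, while `aa` and `bb` are accessed contiguously and `cc` through `ff` become strided, which is why the cache-cost rule rejects the interchange.

```c
__attribute__((aligned(64))) float aa[256][256],bb[256][256],cc[256][256],
                                   dd[256][256],ee[256][256],ff[256][256];

// Hand-interchanged variant of s231_alternative (illustration only).
void s231_alternative_interchanged() {
  for (int nl = 0; nl < 100*(100000/256); nl++) {
    for (int j = 1; j < 256; j++) {     // formerly the inner loop
      for (int i = 0; i < 256; ++i) {   // now innermost: contiguous in aa/bb, vectorizable
        aa[j][i] = aa[j-1][i] + bb[j][i] + cc[i][j]
                 + dd[i][j] + ee[i][j] + ff[i][j];
      }
    }
  }
}
```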

This patch introduces a new option,
`-loop-interchange-prioritize-vectorization`, to prioritize the
vectorization rule over the cache-cost rule.

Related issue: #131130

---------

Co-authored-by: Florian Hahn <flo@fhahn.com>
2025-03-21 17:03:06 +09:00

; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
; RUN:     -pass-remarks-output=%t -disable-output
; RUN: FileCheck -input-file %t --check-prefix=PROFIT-CACHE %s

; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \
; RUN:     -pass-remarks-output=%t -disable-output -loop-interchange-prioritize-vectorization=1
; RUN: FileCheck -input-file %t --check-prefix=PROFIT-VEC %s

@A = dso_local global [256 x [256 x float]] zeroinitializer
@B = dso_local global [256 x [256 x float]] zeroinitializer
@C = dso_local global [256 x [256 x float]] zeroinitializer
@D = dso_local global [256 x [256 x float]] zeroinitializer
@E = dso_local global [256 x [256 x float]] zeroinitializer
@F = dso_local global [256 x [256 x float]] zeroinitializer

; Check the behavior of the LoopInterchange cost-model. In the below code,
; exchanging the loops is not profitable in terms of cache, but it is necessary
; to vectorize the innermost loop.
;
; for (int i = 0; i < 256; i++)
;   for (int j = 1; j < 256; j++)
;     A[j][i] = A[j-1][i] + B[j][i] + C[i][j] + D[i][j] + E[i][j] + F[i][j];
;
; PROFIT-CACHE: --- !Missed
; PROFIT-CACHE-NEXT: Pass: loop-interchange
; PROFIT-CACHE-NEXT: Name: InterchangeNotProfitable
; PROFIT-CACHE-NEXT: Function: f
; PROFIT-CACHE-NEXT: Args:
; PROFIT-CACHE-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization.
; PROFIT-CACHE-NEXT: ...

; PROFIT-VEC: --- !Passed
; PROFIT-VEC-NEXT: Pass: loop-interchange
; PROFIT-VEC-NEXT: Name: Interchanged
; PROFIT-VEC-NEXT: Function: f
; PROFIT-VEC-NEXT: Args:
; PROFIT-VEC-NEXT: - String: Loop interchanged with enclosing loop.
; PROFIT-VEC-NEXT: ...

define void @f() {
entry:
  br label %for.i.header

for.i.header:
  %i = phi i64 [ 0, %entry ], [ %i.next, %for.i.inc ]
  br label %for.j.body

for.j.body:
  %j = phi i64 [ 1, %for.i.header ], [ %j.next, %for.j.body ]
  %j.dec = add nsw i64 %j, -1
  %a.0.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 0, i64 %j.dec, i64 %i
  %b.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @B, i64 0, i64 %j, i64 %i
  %c.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @C, i64 0, i64 %i, i64 %j
  %d.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @D, i64 0, i64 %i, i64 %j
  %e.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @E, i64 0, i64 %i, i64 %j
  %f.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @F, i64 0, i64 %i, i64 %j
  %a.0 = load float, ptr %a.0.index, align 4
  %b = load float, ptr %b.index, align 4
  %c = load float, ptr %c.index, align 4
  %d = load float, ptr %d.index, align 4
  %e = load float, ptr %e.index, align 4
  %f = load float, ptr %f.index, align 4
  %add.0 = fadd float %a.0, %b
  %add.1 = fadd float %add.0, %c
  %add.2 = fadd float %add.1, %d
  %add.3 = fadd float %add.2, %e
  %add.4 = fadd float %add.3, %f
  %a.1.index = getelementptr nuw inbounds [256 x [256 x float]], ptr @A, i64 0, i64 %j, i64 %i
  store float %add.4, ptr %a.1.index, align 4
  %j.next = add nuw nsw i64 %j, 1
  %cmp.j = icmp eq i64 %j.next, 256
  br i1 %cmp.j, label %for.i.inc, label %for.j.body

for.i.inc:
  %i.next = add nuw nsw i64 %i, 1
  %cmp.i = icmp eq i64 %i.next, 256
  br i1 %cmp.i, label %exit, label %for.i.header

exit:
  ret void
}