Files
clang-p2996/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
Aaron Liu d7e16ca28f [LV] Interleave to expose ILP for small loops with scalar reductions.
Interleave for small loops that have reductions inside,
which breaks dependencies and expose.

This gives very significant performance improvements for some benchmarks.
Because small loops could be in very hot functions in real applications.

Differential Revision: https://reviews.llvm.org/D81416
2020-09-01 19:47:32 +00:00

58 lines
2.3 KiB
LLVM

; RUN: opt < %s -loop-vectorize -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
; CHECK-LABEL: vector.body
; CHECK: load double, double*
; CHECK-NEXT: load double, double*
; CHECK-NEXT: load double, double*
; CHECK-NEXT: load double, double*
; CHECK: fmul fast double
; CHECK-NEXT: fmul fast double
; CHECK-NEXT: fmul fast double
; CHECK-NEXT: fmul fast double
; CHECK: fadd fast double
; CHECK-NEXT: fadd fast double
; CHECK-NEXT: fadd fast double
; CHECK-NEXT: fadd fast double
target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"
define dso_local void @test(i32*** %arg, double** %arg1) align 2 {
bb:
%tpm15 = load i32**, i32*** %arg, align 8
%tpm19 = load double*, double** %arg1, align 8
br label %bb22
bb22: ; preds = %bb33, %bb
%tpm26 = add i64 0, 1
%tpm10 = alloca i32, align 8
%tpm27 = getelementptr inbounds i32, i32* %tpm10, i64 %tpm26
%tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 0
%tpm29 = load i32*, i32** %tpm28, align 8
%tpm17 = alloca double, align 8
%tpm32 = getelementptr inbounds double, double* %tpm17, i64 %tpm26
br label %bb40
bb33: ; preds = %bb40
%tpm35 = getelementptr inbounds double, double* %tpm19, i64 0
%tpm37 = fsub fast double 0.000000e+00, %tpm50
store double %tpm37, double* %tpm35, align 8
br label %bb22
bb40: ; preds = %bb40, %bb22
%tpm41 = phi i32* [ %tpm51, %bb40 ], [ %tpm27, %bb22 ]
%tpm42 = phi double* [ %tpm52, %bb40 ], [ %tpm32, %bb22 ]
%tpm43 = phi double [ %tpm50, %bb40 ], [ 0.000000e+00, %bb22 ]
%tpm44 = load double, double* %tpm42, align 8
%tpm45 = load i32, i32* %tpm41, align 4
%tpm46 = zext i32 %tpm45 to i64
%tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46
%tpm48 = load double, double* %tpm47, align 8
%tpm49 = fmul fast double %tpm48, %tpm44
%tpm50 = fadd fast double %tpm49, %tpm43
%tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1
%tpm52 = getelementptr inbounds double, double* %tpm42, i64 1
%tpm53 = icmp eq i32* %tpm51, %tpm29
br i1 %tpm53, label %bb33, label %bb40
}