; clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll
;
; Commit b28f407df9 (Alexey Bataev): [SLP]Improve reduction cost model for scalars.
; Instead of the abstract cost of the scalar reduction ops, try to use the
; cost of the actual reduction operation instructions, where possible. Also,
; remove the estimation of the vectorized GEP pointers for reduced loads,
; since that is already handled in the tree.
;
; Differential Revision: https://reviews.llvm.org/D148036
; 2023-04-12 11:32:51 -07:00

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
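
; The four RUN lines pin down how SLP reduction matching behaves at different
; x86 feature levels: plain x86-64 (SSE2), x86-64-v2 (SSE4.1), and two
; AVX-class CPUs that share the AVX prefix. The interesting split is at four
; elements: SSE2 lacks a packed signed i32 max (pmaxsd arrived with SSE4.1),
; so a <4 x i32> smax reduction is comparatively expensive there and the
; scalar form wins (see @smax_v4i32 below).
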
@arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16

declare i32 @llvm.smax.i32(i32, i32)
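
; @smax_v2i32: two scalar loads feeding a single llvm.smax.i32 call. With only
; two elements there is no reduction tree worth collapsing, so every target
; keeps the scalar code (shared CHECK prefix).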
define i32 @smax_v2i32(i32) {
; CHECK-LABEL: @smax_v2i32(
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
; CHECK-NEXT: ret i32 [[TMP4]]
;
%2 = load i32, ptr @arr, align 16
%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
%4 = call i32 @llvm.smax.i32(i32 %2, i32 %3)
ret i32 %4
}
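
; @smax_v4i32: the first size where the targets diverge. SSE2 keeps the three
; scalar llvm.smax.i32 calls, while SSE4.1 and AVX load a <4 x i32> and emit a
; single llvm.vector.reduce.smax.v4i32 call.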
define i32 @smax_v4i32(i32) {
; SSE2-LABEL: @smax_v4i32(
; SSE2-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16
; SSE2-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
; SSE2-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
; SSE2-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
; SSE2-NEXT: [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
; SSE2-NEXT: [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
; SSE2-NEXT: [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
; SSE2-NEXT: ret i32 [[TMP8]]
;
; SSE4-LABEL: @smax_v4i32(
; SSE4-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
; SSE4-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; SSE4-NEXT: ret i32 [[TMP3]]
;
; AVX-LABEL: @smax_v4i32(
; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; AVX-NEXT: ret i32 [[TMP3]]
;
%2 = load i32, ptr @arr, align 16
%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
%4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
%5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
%6 = call i32 @llvm.smax.i32(i32 %2, i32 %3)
%7 = call i32 @llvm.smax.i32(i32 %6, i32 %4)
%8 = call i32 @llvm.smax.i32(i32 %7, i32 %5)
ret i32 %8
}
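
; @smax_v8i32: at eight elements the vector reduction is profitable on every
; target, so all RUN lines share one CHECK block: a <8 x i32> load plus
; llvm.vector.reduce.smax.v8i32.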
define i32 @smax_v8i32(i32) {
; CHECK-LABEL: @smax_v8i32(
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @arr, align 16
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
; CHECK-NEXT: ret i32 [[TMP3]]
;
%2 = load i32, ptr @arr, align 16
%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
%4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
%5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
%6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
%7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
%8 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
%9 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
%10 = call i32 @llvm.smax.i32(i32 %2, i32 %3)
%11 = call i32 @llvm.smax.i32(i32 %10, i32 %4)
%12 = call i32 @llvm.smax.i32(i32 %11, i32 %5)
%13 = call i32 @llvm.smax.i32(i32 %12, i32 %6)
%14 = call i32 @llvm.smax.i32(i32 %13, i32 %7)
%15 = call i32 @llvm.smax.i32(i32 %14, i32 %8)
%16 = call i32 @llvm.smax.i32(i32 %15, i32 %9)
ret i32 %16
}
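
; @smax_v16i32: the same pattern at sixteen elements; a <16 x i32> load feeds
; llvm.vector.reduce.smax.v16i32 on every target.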
define i32 @smax_v16i32(i32) {
; CHECK-LABEL: @smax_v16i32(
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @arr, align 16
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]])
; CHECK-NEXT: ret i32 [[TMP3]]
;
%2 = load i32, ptr @arr, align 16
%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
%4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
%5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
%6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
%7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
%8 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
%9 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
%10 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 8), align 16
%11 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 9), align 4
%12 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 10), align 8
%13 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 11), align 4
%14 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 12), align 16
%15 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 13), align 4
%16 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 14), align 8
%17 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 15), align 4
%18 = call i32 @llvm.smax.i32(i32 %2, i32 %3)
%19 = call i32 @llvm.smax.i32(i32 %18, i32 %4)
%20 = call i32 @llvm.smax.i32(i32 %19, i32 %5)
%21 = call i32 @llvm.smax.i32(i32 %20, i32 %6)
%22 = call i32 @llvm.smax.i32(i32 %21, i32 %7)
%23 = call i32 @llvm.smax.i32(i32 %22, i32 %8)
%24 = call i32 @llvm.smax.i32(i32 %23, i32 %9)
%25 = call i32 @llvm.smax.i32(i32 %24, i32 %10)
%26 = call i32 @llvm.smax.i32(i32 %25, i32 %11)
%27 = call i32 @llvm.smax.i32(i32 %26, i32 %12)
%28 = call i32 @llvm.smax.i32(i32 %27, i32 %13)
%29 = call i32 @llvm.smax.i32(i32 %28, i32 %14)
%30 = call i32 @llvm.smax.i32(i32 %29, i32 %15)
%31 = call i32 @llvm.smax.i32(i32 %30, i32 %16)
%32 = call i32 @llvm.smax.i32(i32 %31, i32 %17)
ret i32 %32
}