Instead of the abstract cost of the scalar reduction ops, use the cost of the actual reduction operation instructions where possible. Also, remove the cost estimation of the vectorized GEP pointers for reduced loads, since it is already handled in the tree.

Differential Revision: https://reviews.llvm.org/D148036
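The effect on the @smax_v4i32 case below comes down to the comparison the vectorizer makes between the scalar smax chain and a single wide load plus a vector reduction. The sketch below is a self-contained toy model of that trade-off, not the SLP vectorizer's actual code; the struct, helper names, and cost numbers are hypothetical stand-ins for what a target's cost model would report.

// reduction_cost_sketch.cpp -- illustrative only; every number here is a
// hypothetical stand-in for what a target cost model might report.
#include <cstdio>

struct TargetCosts {
  int ScalarLoad;       // one i32 load
  int ScalarSmax;       // one scalar @llvm.smax.i32 (cmp + cmov or similar)
  int VectorLoad;       // one <4 x i32> load
  int VectorReduceSmax; // one @llvm.vector.reduce.smax.v4i32
};

// Scalar form: NumElts loads plus a chain of NumElts-1 smax calls.
static int scalarChainCost(const TargetCosts &C, int NumElts) {
  return NumElts * C.ScalarLoad + (NumElts - 1) * C.ScalarSmax;
}

// Vectorized form: one wide load plus one reduction instruction. The patch
// prices the reduction with its actual instruction cost instead of an
// abstract "NumElts - 1 scalar ops" estimate.
static int vectorizedCost(const TargetCosts &C) {
  return C.VectorLoad + C.VectorReduceSmax;
}

int main() {
  // Hypothetical numbers: without a vector smax instruction the reduction
  // must be expanded (expensive); with SSE4.1's pmaxsd it becomes cheap.
  const TargetCosts NoVectorSmax = {1, 2, 1, 12};
  const TargetCosts WithVectorSmax = {1, 2, 1, 3};
  for (const TargetCosts &C : {NoVectorSmax, WithVectorSmax}) {
    int Scalar = scalarChainCost(C, 4);
    int Vector = vectorizedCost(C);
    std::printf("scalar=%d vector=%d -> %s\n", Scalar, Vector,
                Vector < Scalar ? "vectorize" : "keep scalar");
  }
  return 0;
}

Under these made-up numbers the model keeps the scalar chain for the first target and vectorizes for the second, mirroring how the SSE2 and SSE4 CHECK lines for @smax_v4i32 differ in the test below.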
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=x86-64-v2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE4
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX

@arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16

declare i32 @llvm.smax.i32(i32, i32)

define i32 @smax_v2i32(i32) {
; CHECK-LABEL: @smax_v2i32(
; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
; CHECK-NEXT:    ret i32 [[TMP4]]
;
  %2 = load i32, ptr @arr, align 16
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
  %4 = call i32 @llvm.smax.i32(i32 %2, i32 %3)
  ret i32 %4
}

define i32 @smax_v4i32(i32) {
; SSE2-LABEL: @smax_v4i32(
; SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr @arr, align 16
; SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
; SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
; SSE2-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
; SSE2-NEXT:    [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
; SSE2-NEXT:    [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
; SSE2-NEXT:    [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
; SSE2-NEXT:    ret i32 [[TMP8]]
;
; SSE4-LABEL: @smax_v4i32(
; SSE4-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
; SSE4-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; SSE4-NEXT:    ret i32 [[TMP3]]
;
; AVX-LABEL: @smax_v4i32(
; AVX-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; AVX-NEXT:    ret i32 [[TMP3]]
;
  %2 = load i32, ptr @arr, align 16
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
  %6 = call i32 @llvm.smax.i32(i32 %2, i32 %3)
  %7 = call i32 @llvm.smax.i32(i32 %6, i32 %4)
  %8 = call i32 @llvm.smax.i32(i32 %7, i32 %5)
  ret i32 %8
}

define i32 @smax_v8i32(i32) {
; CHECK-LABEL: @smax_v8i32(
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr @arr, align 16
; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]])
; CHECK-NEXT:    ret i32 [[TMP3]]
;
  %2 = load i32, ptr @arr, align 16
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
  %8 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
  %9 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
  %10 = call i32 @llvm.smax.i32(i32 %2, i32 %3)
  %11 = call i32 @llvm.smax.i32(i32 %10, i32 %4)
  %12 = call i32 @llvm.smax.i32(i32 %11, i32 %5)
  %13 = call i32 @llvm.smax.i32(i32 %12, i32 %6)
  %14 = call i32 @llvm.smax.i32(i32 %13, i32 %7)
  %15 = call i32 @llvm.smax.i32(i32 %14, i32 %8)
  %16 = call i32 @llvm.smax.i32(i32 %15, i32 %9)
  ret i32 %16
}

define i32 @smax_v16i32(i32) {
; CHECK-LABEL: @smax_v16i32(
; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @arr, align 16
; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]])
; CHECK-NEXT:    ret i32 [[TMP3]]
;
  %2 = load i32, ptr @arr, align 16
  %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
  %4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
  %5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
  %6 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
  %7 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 5), align 4
  %8 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8
  %9 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4
  %10 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 8), align 16
  %11 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 9), align 4
  %12 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 10), align 8
  %13 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 11), align 4
  %14 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 12), align 16
  %15 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 13), align 4
  %16 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 14), align 8
  %17 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 15), align 4
  %18 = call i32 @llvm.smax.i32(i32 %2, i32 %3)
  %19 = call i32 @llvm.smax.i32(i32 %18, i32 %4)
  %20 = call i32 @llvm.smax.i32(i32 %19, i32 %5)
  %21 = call i32 @llvm.smax.i32(i32 %20, i32 %6)
  %22 = call i32 @llvm.smax.i32(i32 %21, i32 %7)
  %23 = call i32 @llvm.smax.i32(i32 %22, i32 %8)
  %24 = call i32 @llvm.smax.i32(i32 %23, i32 %9)
  %25 = call i32 @llvm.smax.i32(i32 %24, i32 %10)
  %26 = call i32 @llvm.smax.i32(i32 %25, i32 %11)
  %27 = call i32 @llvm.smax.i32(i32 %26, i32 %12)
  %28 = call i32 @llvm.smax.i32(i32 %27, i32 %13)
  %29 = call i32 @llvm.smax.i32(i32 %28, i32 %14)
  %30 = call i32 @llvm.smax.i32(i32 %29, i32 %15)
  %31 = call i32 @llvm.smax.i32(i32 %30, i32 %16)
  %32 = call i32 @llvm.smax.i32(i32 %31, i32 %17)
  ret i32 %32
}