This reverts commit 1387a13e1d.
The reverted commit introduced performance regressions on AArch64 in
cases where the cost of a vector GEP + extracts is offset by the
benefits of vectorizing the rest of the tree.
The test in llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
illustrates the issue. It was extracted from code that regressed a SPEC
benchmark by 15%.
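As a rough illustration, the scalar pattern the test exercises looks like the
following C sketch (a hypothetical reconstruction from the IR below, not the
actual SPEC source; names mirror the test): each pointer passed to use_4 is
base_gep offset by the 64-bit difference of two zero-extended 32-bit loads.
SLP can vectorize the loads/zexts/subs, but feeding the GEPs then requires
extracting each lane of the <4 x i64> result.

#include <stdint.h>

void use_4(int *, int *, int *, int *);

/* Hypothetical C equivalent of @should_vectorize_gep in the test below. */
void should_vectorize_gep(uint32_t *base1, uint32_t *base2, int *base_gep) {
  /* Each offset is an i64 subtraction of two zero-extended i32 loads. */
  int *res1 = base_gep + (int64_t)((uint64_t)base1[0] - (uint64_t)base2[0]);
  int *res2 = base_gep + (int64_t)((uint64_t)base1[1] - (uint64_t)base2[1]);
  int *res3 = base_gep + (int64_t)((uint64_t)base1[2] - (uint64_t)base2[2]);
  int *res4 = base_gep + (int64_t)((uint64_t)base1[3] - (uint64_t)base2[3]);
  use_4(res1, res2, res3, res4);
}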
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -mtriple=arm64-apple-ios -S -passes=slp-vectorizer < %s | FileCheck %s
; vectorization requires a vector GEP + extracts, but the cost is offset by being able to efficiently vectorize the rest of the tree
define void @should_vectorize_gep(ptr %base1, ptr %base2, ptr %base_gep) {
; CHECK-LABEL: define void @should_vectorize_gep
; CHECK-SAME: (ptr [[BASE1:%.*]], ptr [[BASE2:%.*]], ptr [[BASE_GEP:%.*]]) {
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE1]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[TMP0]] to <4 x i64>
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[BASE2]], align 2
; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
; CHECK-NEXT: [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
; CHECK-NEXT: [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
; CHECK-NEXT: [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP8]]
; CHECK-NEXT: call void @use_4(ptr [[GETELEMENTPTR_RES_1]], ptr [[GETELEMENTPTR_RES_2]], ptr [[GETELEMENTPTR_RES_3]], ptr [[GETELEMENTPTR_RES_4]])
; CHECK-NEXT: ret void
;
bb:
%load1 = load i32, ptr %base1, align 2
%zext1 = zext i32 %load1 to i64
%load2 = load i32, ptr %base2, align 2
%zext2 = zext i32 %load2 to i64
%sub = sub i64 %zext1, %zext2
%getelementptr.res.1 = getelementptr i32, ptr %base_gep, i64 %sub
%getelementptr1 = getelementptr i32, ptr %base1, i64 1
%getelementptr2 = getelementptr i32, ptr %base2, i64 1
%load3 = load i32, ptr %getelementptr1, align 2
%zext3 = zext i32 %load3 to i64
%load4 = load i32, ptr %getelementptr2, align 2
%zext4 = zext i32 %load4 to i64
%sub2 = sub i64 %zext3, %zext4
%getelementptr.res.2 = getelementptr i32, ptr %base_gep, i64 %sub2
%getelementptr3 = getelementptr i32, ptr %base1, i64 2
%getelementptr4 = getelementptr i32, ptr %base2, i64 2
%load5 = load i32, ptr %getelementptr3, align 2
%zext5 = zext i32 %load5 to i64
%load6 = load i32, ptr %getelementptr4, align 2
%zext6 = zext i32 %load6 to i64
%sub3 = sub i64 %zext5, %zext6
%getelementptr.res.3 = getelementptr i32, ptr %base_gep, i64 %sub3
%getelementptr5 = getelementptr i32, ptr %base1, i64 3
%getelementptr6 = getelementptr i32, ptr %base2, i64 3
%load7 = load i32, ptr %getelementptr5, align 2
%zext7 = zext i32 %load7 to i64
%load8 = load i32, ptr %getelementptr6, align 2
%zext8 = zext i32 %load8 to i64
%sub4 = sub i64 %zext7, %zext8
%getelementptr.res.4 = getelementptr i32, ptr %base_gep, i64 %sub4
call void @use_4(ptr %getelementptr.res.1, ptr %getelementptr.res.2, ptr %getelementptr.res.3, ptr %getelementptr.res.4)
ret void
}
declare void @use_4(ptr, ptr, ptr, ptr)