If the masked gathers can be reordered to produce a strided access pattern, and the reordering does not affect the common reordering, it is better to try to reorder the masked gathers for better performance.

Differential Revision: https://reviews.llvm.org/D157009
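For context, a minimal C sketch of the access pattern exercised by the test below (hypothetical source, not part of the test; the scalar IR is its fully unrolled form, and the unused %b parameter of the IR function is omitted here):

#include <stdlib.h>

/* Eight signed 8-bit loads spaced 64 bytes apart -- a strided access
 * pattern.  Each value goes through abs() and is accumulated into a
 * 32-bit sum.  After full unrolling, the SLP vectorizer can treat the
 * loads as one <8 x i8> masked gather over the strided addresses, as
 * the CHECK lines in the test verify. */
int sum_of_abs(const signed char *a) {
  int sum = 0;
  for (int i = 0; i < 8; ++i)
    sum += abs(a[i * 64]);
  return sum;
}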
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=riscv64-unknown-linux -mattr=+v | FileCheck %s

define i32 @sum_of_abs(ptr noalias %a, ptr noalias %b) {
; CHECK-LABEL: define i32 @sum_of_abs
; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[A]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, <8 x ptr> [[TMP1]], <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448>
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP2]], i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> poison)
; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP3]], i1 false)
; CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i32>
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]])
; CHECK-NEXT:    ret i32 [[TMP6]]
;
entry:
  %0 = load i8, ptr %a, align 1
  %spec.select.i = tail call i8 @llvm.abs.i8(i8 %0, i1 false)
  %conv = sext i8 %spec.select.i to i32
  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 64
  %1 = load i8, ptr %arrayidx.1, align 1
  %spec.select.i.1 = tail call i8 @llvm.abs.i8(i8 %1, i1 false)
  %conv.1 = sext i8 %spec.select.i.1 to i32
  %add.1 = add nsw i32 %conv, %conv.1
  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 128
  %2 = load i8, ptr %arrayidx.2, align 1
  %spec.select.i.2 = tail call i8 @llvm.abs.i8(i8 %2, i1 false)
  %conv.2 = sext i8 %spec.select.i.2 to i32
  %add.2 = add nsw i32 %add.1, %conv.2
  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 192
  %3 = load i8, ptr %arrayidx.3, align 1
  %spec.select.i.3 = tail call i8 @llvm.abs.i8(i8 %3, i1 false)
  %conv.3 = sext i8 %spec.select.i.3 to i32
  %add.3 = add nsw i32 %add.2, %conv.3
  %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 256
  %4 = load i8, ptr %arrayidx.4, align 1
  %spec.select.i.4 = tail call i8 @llvm.abs.i8(i8 %4, i1 false)
  %conv.4 = sext i8 %spec.select.i.4 to i32
  %add.4 = add nsw i32 %add.3, %conv.4
  %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 320
  %5 = load i8, ptr %arrayidx.5, align 1
  %spec.select.i.5 = tail call i8 @llvm.abs.i8(i8 %5, i1 false)
  %conv.5 = sext i8 %spec.select.i.5 to i32
  %add.5 = add nsw i32 %add.4, %conv.5
  %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 384
  %6 = load i8, ptr %arrayidx.6, align 1
  %spec.select.i.6 = tail call i8 @llvm.abs.i8(i8 %6, i1 false)
  %conv.6 = sext i8 %spec.select.i.6 to i32
  %add.6 = add nsw i32 %add.5, %conv.6
  %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 448
  %7 = load i8, ptr %arrayidx.7, align 1
  %spec.select.i.7 = tail call i8 @llvm.abs.i8(i8 %7, i1 false)
  %conv.7 = sext i8 %spec.select.i.7 to i32
  %add.7 = add nsw i32 %add.6, %conv.7
  ret i32 %add.7
}

declare i8 @llvm.abs.i8(i8, i1 immarg)