If a vector of loads can be vectorized as a masked gather and there are several other masked gather nodes, the compiler can check whether such nodes can be combined into a single wide consecutive/strided load node, which provides better performance.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/110151
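As a minimal sketch of the kind of pattern this targets (hypothetical IR, not part of the patch or the test below): several scalar loads whose addresses turn out to be consecutive can be covered by one wide load instead of a masked gather.

; Hypothetical example: four i16 loads at consecutive offsets from %p,
; built back into a vector with insertelement.
define <4 x i16> @consecutive_loads(ptr %p) {
entry:
  %p1 = getelementptr i16, ptr %p, i64 1
  %p2 = getelementptr i16, ptr %p, i64 2
  %p3 = getelementptr i16, ptr %p, i64 3
  %l0 = load i16, ptr %p, align 2
  %l1 = load i16, ptr %p1, align 2
  %l2 = load i16, ptr %p2, align 2
  %l3 = load i16, ptr %p3, align 2
  %v0 = insertelement <4 x i16> poison, i16 %l0, i32 0
  %v1 = insertelement <4 x i16> %v0, i16 %l1, i32 1
  %v2 = insertelement <4 x i16> %v1, i16 %l2, i32 2
  %v3 = insertelement <4 x i16> %v2, i16 %l3, i32 3
  ret <4 x i16> %v3
}
; Because the addresses are consecutive, the vectorizer can emit a single
; wide load instead of an @llvm.masked.gather call, roughly:
;   %w = load <4 x i16>, ptr %p, align 2

The test below exercises the non-consecutive case: the indices come from a runtime vector, so a masked gather is still emitted.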
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v -slp-threshold=-11 < %s | FileCheck %s

define <4 x i32> @test(<2 x i64> %v, ptr %p) {
; CHECK-LABEL: define <4 x i32> @test(
; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[V]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[TMP4]]
; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
; CHECK-NEXT:    [[TMP7:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
;
entry:
  %0 = extractelement <2 x i64> %v, i32 1
  %arrayidx127.2 = getelementptr i16, ptr %p, i64 %0
  %1 = load i16, ptr %arrayidx127.2, align 2
  %conv128.2 = zext i16 %1 to i32
  %2 = extractelement <2 x i64> %v, i32 0
  %arrayidx127.3 = getelementptr i16, ptr %p, i64 %2
  %3 = load i16, ptr %arrayidx127.3, align 2
  %conv128.3 = zext i16 %3 to i32
  %4 = insertelement <4 x i32> zeroinitializer, i32 %conv128.2, i32 0
  %5 = insertelement <4 x i32> %4, i32 %conv128.3, i32 1
  ret <4 x i32> %5
}