Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
Alexey Bataev a65a5feb1a [SLP]Improve masked loads vectorization, attempting gathered loads
If a vector of loads can be vectorized as a masked gather and there are
several other masked gather nodes, the compiler can check whether it is
possible to combine such nodes into one big consecutive/strided load
node, which provides better performance.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/110151
2024-10-08 16:43:10 -04:00

31 lines
1.8 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v -slp-threshold=-11 < %s | FileCheck %s
; Loads two i16 elements from %p at the indices held in %v, zero-extends them
; to i32, and returns them in lanes 0 and 1 of a <4 x i32> whose remaining
; lanes are zero. Lane 1 of %v feeds result lane 0 and lane 0 of %v feeds
; result lane 1, i.e. the indices are consumed in reversed lane order.
; The autogenerated assertions below expect a reversing shuffle of %v, a
; two-element masked gather, a vector zext, and a blend into the zero vector.
define <4 x i32> @test(<2 x i64> %v, ptr %p) {
; CHECK-LABEL: define <4 x i32> @test(
; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[V]], <2 x i64> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[TMP4]]
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
; CHECK-NEXT: ret <4 x i32> [[TMP5]]
;
entry:
; Value for result lane 0: the i16 at p[v[1]], zero-extended to i32.
%0 = extractelement <2 x i64> %v, i32 1
%arrayidx127.2 = getelementptr i16, ptr %p, i64 %0
%1 = load i16, ptr %arrayidx127.2, align 2
%conv128.2 = zext i16 %1 to i32
; Value for result lane 1: the i16 at p[v[0]], zero-extended to i32.
%2 = extractelement <2 x i64> %v, i32 0
%arrayidx127.3 = getelementptr i16, ptr %p, i64 %2
%3 = load i16, ptr %arrayidx127.3, align 2
%conv128.3 = zext i16 %3 to i32
; Insert both values into an all-zero vector; lanes 2 and 3 remain zero.
%4 = insertelement <4 x i32> zeroinitializer, i32 %conv128.2, i32 0
%5 = insertelement <4 x i32> %4, i32 %conv128.3, i32 1
ret <4 x i32> %5
}