Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
David Green bd4b61efbd [CostModel] Remove VF from IntrinsicCostAttributes
getIntrinsicInstrCost takes a IntrinsicCostAttributes holding various
parameters of the intrinsic being costed. It can either be called with a
scalar intrinsic (RetTy==Scalar, VF==1), with a vector instruction
(RetTy==Vector, VF==1) or from the vectorizer with a scalar type and
vector width (RetTy==Scalar, VF>1). A RetTy==Vector, VF>1 is considered
an error. Both of the vector modes are expected to be treated the same,
but because this is confusing many backends end up getting it wrong.

Instead of trying to work with those two values separately, this removes
the VF parameter, widening the RetTy/ArgTys by VF when called from the
vectorizer. This keeps things simpler, but does require some other
modifications to keep things consistent.

Most backends look like this will be an improvement (or were not using
getIntrinsicInstrCost). AMDGPU needed the most changes to keep the code
from c230965ccf working. ARM removed the fix in
dfac521da1, WebAssembly happens to get a fixup for an SLP cost
issue and both X86 and AArch64 seem to now be using better costs from
the vectorizer.

Differential Revision: https://reviews.llvm.org/D95291
2021-02-23 13:03:26 +00:00

42 lines
1.8 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -instcombine -S | FileCheck %s
; Regression test for a bug in the SLP vectorizer that was causing
; these rotates to be incorrectly combined into a vector rotate.
; The expected output below keeps the two scalar i64 llvm.fshl calls
; independent; the bug was merging them into a single vector rotate,
; which is not profitable/legal to lower here.
target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"
; @foo rotates each of the two i64 lanes of %x left by an amount taken
; from lanes 2 and 3 of %y (zero-extended to i64), and stores the two
; results to out[0] and out[1].  fshl(a, a, n) with identical first two
; operands is a rotate-left by n.
define void @foo(<2 x i64> %x, <4 x i32> %y, i64* %out) #0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT: [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0
; CHECK-NEXT: [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[B]] to i64
; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]])
; CHECK-NEXT: store i64 [[C]], i64* [[OUT:%.*]], align 8
; CHECK-NEXT: [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3
; CHECK-NEXT: [[CONV17:%.*]] = zext i32 [[E]] to i64
; CHECK-NEXT: [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]])
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[OUT]], i32 1
; CHECK-NEXT: store i64 [[F]], i64* [[ARRAYIDX2]], align 8
; CHECK-NEXT: ret void
;
; First rotate: lane 0 of %x, amount = zext(lane 2 of %y), result -> out[0].
%a = extractelement <2 x i64> %x, i32 0
%b = extractelement <4 x i32> %y, i32 2
%conv6 = zext i32 %b to i64
%c = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %conv6)
store i64 %c, i64* %out
; Second rotate: lane 1 of %x, amount = zext(lane 3 of %y), result -> out[1].
%d = extractelement <2 x i64> %x, i32 1
%e = extractelement <4 x i32> %y, i32 3
%conv17 = zext i32 %e to i64
%f = tail call i64 @llvm.fshl.i64(i64 %d, i64 %d, i64 %conv17)
%arrayidx2 = getelementptr inbounds i64, i64* %out, i32 1
store i64 %f, i64* %arrayidx2
ret void
}
declare i64 @llvm.fshl.i64(i64, i64, i64)
; +simd128 makes 128-bit vector types legal on wasm, which is what made
; the vectorizer consider (incorrectly) forming a vector rotate here.
attributes #0 = {"target-cpu"="generic" "target-features"="+simd128"}