getIntrinsicInstrCost takes an IntrinsicCostAttributes holding various parameters of the intrinsic being costed. It can be called with a scalar intrinsic (RetTy==Scalar, VF==1), with a vector instruction (RetTy==Vector, VF==1), or from the vectorizer with a scalar type and a vector width (RetTy==Scalar, VF>1). RetTy==Vector with VF>1 is considered an error. Both vector modes are expected to be treated the same, but because this is confusing many backends end up getting it wrong.

Instead of trying to handle those two cases separately, this removes the VF parameter and instead widens the RetTy/ArgTys by VF when called from the vectorizer. This keeps things simpler, but does require some other modifications to keep things consistent.

For most backends this looks like an improvement (or they were not using getIntrinsicInstrCost). AMDGPU needed the most changes to keep the code from c230965ccf working. ARM removed the fix in dfac521da1, WebAssembly happens to get a fixup for an SLP cost issue, and both X86 and AArch64 now seem to be using better costs from the vectorizer.

Differential Revision: https://reviews.llvm.org/D95291
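As a rough illustration of the calling convention this patch moves to, the sketch below (written against current TTI headers; the helper name and the choice of TCK_RecipThroughput are illustrative, not code from LoopVectorize) widens the scalar return/argument types by VF and builds a single IntrinsicCostAttributes, instead of passing a separate VF:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Hypothetical helper: rather than handing a VF to getIntrinsicInstrCost,
// widen the scalar types to <VF x Ty> first and cost the widened call.
static InstructionCost costWidenedIntrinsic(const TargetTransformInfo &TTI,
                                            Intrinsic::ID ID, Type *ScalarRetTy,
                                            ArrayRef<Type *> ScalarArgTys,
                                            ElementCount VF) {
  Type *VecRetTy = VectorType::get(ScalarRetTy, VF);
  SmallVector<Type *, 4> VecArgTys;
  for (Type *Ty : ScalarArgTys)
    VecArgTys.push_back(VectorType::get(Ty, VF));
  // A single IntrinsicCostAttributes now describes the whole widened call.
  IntrinsicCostAttributes ICA(ID, VecRetTy, VecArgTys);
  return TTI.getIntrinsicInstrCost(ICA, TargetTransformInfo::TCK_RecipThroughput);
}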
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -instcombine -S | FileCheck %s

; Regression test for a bug in the SLP vectorizer that was causing
; these rotates to be incorrectly combined into a vector rotate.

target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
target triple = "wasm32-unknown-unknown"

define void @foo(<2 x i64> %x, <4 x i32> %y, i64* %out) #0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT:    [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0
; CHECK-NEXT:    [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
; CHECK-NEXT:    [[CONV6:%.*]] = zext i32 [[B]] to i64
; CHECK-NEXT:    [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]])
; CHECK-NEXT:    store i64 [[C]], i64* [[OUT:%.*]], align 8
; CHECK-NEXT:    [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1
; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3
; CHECK-NEXT:    [[CONV17:%.*]] = zext i32 [[E]] to i64
; CHECK-NEXT:    [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]])
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[OUT]], i32 1
; CHECK-NEXT:    store i64 [[F]], i64* [[ARRAYIDX2]], align 8
; CHECK-NEXT:    ret void
;
  %a = extractelement <2 x i64> %x, i32 0
  %b = extractelement <4 x i32> %y, i32 2
  %conv6 = zext i32 %b to i64
  %c = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %conv6)
  store i64 %c, i64* %out
  %d = extractelement <2 x i64> %x, i32 1
  %e = extractelement <4 x i32> %y, i32 3
  %conv17 = zext i32 %e to i64
  %f = tail call i64 @llvm.fshl.i64(i64 %d, i64 %d, i64 %conv17)
  %arrayidx2 = getelementptr inbounds i64, i64* %out, i32 1
  store i64 %f, i64* %arrayidx2
  ret void
}

declare i64 @llvm.fshl.i64(i64, i64, i64)

attributes #0 = {"target-cpu"="generic" "target-features"="+simd128"}