Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
Benjamin Kramer 733c7fc55d [NVPTX] Turn on Loop/SLP vectorization
Since PTX has grown a <2 x half> datatype vectorization has become more
important. The late LoadStoreVectorizer intentionally only does loads
and stores, but now arithmetic has to be vectorized for optimal
throughput too.

This is still very limited, SLP vectorization happily creates <2 x half>
if it's a legal type but there's still a lot of register moving
happening to get that fed into a vectorized store. Overall it's a small
performance win by reducing the amount of arithmetic instructions.

I haven't really checked what the loop vectorizer does to PTX code, the
cost model there might need some more tweaks. I didn't see it causing
harm though.

Differential Revision: https://reviews.llvm.org/D46130

llvm-svn: 331035
2018-04-27 13:36:05 +00:00

41 lines
1.6 KiB
LLVM

; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 | FileCheck %s
; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_40 | FileCheck %s -check-prefix=NOVECTOR
; CHECK-LABEL: @fusion
; CHECK: load <2 x half>, <2 x half>*
; CHECK: fmul fast <2 x half>
; CHECK: fadd fast <2 x half>
; CHECK: store <2 x half> %4, <2 x half>
; NOVECTOR-LABEL: @fusion
; NOVECTOR: load half
; NOVECTOR: fmul fast half
; NOVECTOR: fadd fast half
; NOVECTOR: fmul fast half
; NOVECTOR: fadd fast half
; NOVECTOR: store half
define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %arg, i8* noalias nocapture readonly align 256 dereferenceable(19267584) %arg1, i32 %arg2, i32 %arg3) local_unnamed_addr #0 {
%tmp = shl nuw nsw i32 %arg2, 6
%tmp4 = or i32 %tmp, %arg3
%tmp5 = shl nuw nsw i32 %tmp4, 2
%tmp6 = zext i32 %tmp5 to i64
%tmp7 = or i64 %tmp6, 1
%tmp10 = bitcast i8* %arg1 to half*
%tmp11 = getelementptr inbounds half, half* %tmp10, i64 %tmp6
%tmp12 = load half, half* %tmp11, align 8
%tmp13 = fmul fast half %tmp12, 0xH5380
%tmp14 = fadd fast half %tmp13, 0xH57F0
%tmp15 = bitcast i8* %arg to half*
%tmp16 = getelementptr inbounds half, half* %tmp15, i64 %tmp6
store half %tmp14, half* %tmp16, align 8
%tmp17 = getelementptr inbounds half, half* %tmp10, i64 %tmp7
%tmp18 = load half, half* %tmp17, align 2
%tmp19 = fmul fast half %tmp18, 0xH5380
%tmp20 = fadd fast half %tmp19, 0xH57F0
%tmp21 = getelementptr inbounds half, half* %tmp15, i64 %tmp7
store half %tmp20, half* %tmp21, align 2
ret void
}
attributes #0 = { nounwind }