Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
Philip Reames 2c7786e94a Prefer use of 0.0 over -0.0 for fadd reductions w/nsz (in IR) (#106770)
This is a follow-up to 924907bc6, and is mostly motivated by consistency
but does include one additional optimization. In general, we prefer 0.0
over -0.0 as the identity value for an fadd. We use that value in
several places, but don't in others. So, let's be consistent and use the
same identity (when nsz allows) everywhere.

This creates a bunch of test churn, but due to 924907bc6, most of that
churn doesn't actually indicate a change in codegen. The exception is
that this change enables the use of 0.0 for nsz, but *not* reassoc, fadd
reductions. Or said differently, it allows the neutral value of an
ordered fadd reduction to be 0.0.
2024-09-03 09:16:37 -07:00

114 lines
6.2 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64 -passes=slp-vectorizer -S -mcpu=skylake-avx512 | FileCheck %s
; The test represents the case with multiple vectorization possibilities
; but the most effective way to vectorize it is to match both 8-way reductions
; feeding the insertelement vector build sequence.
declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg, <2 x i1>)
define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15>
; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> poison)
; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]]
; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]])
; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> <i1 true, i1 true>)
; CHECK-NEXT: ret void
;
entry:
; Scalar input: two interleaved 8-way fadd reduction chains that share the
; odd-index loads from %arg (offsets 1, 3, ..., 15):
;   %rdx1 = sum_{i=0..7} arg1[i]      * arg[2*i+1]   (fadd chain %rdx1.*)
;   %rdx2 = sum_{i=0..7} arg1[16 + i] * arg[2*i+1]   (fadd chain %rdx2.*)
; The CHECK lines above show SLP collapsing each chain into one
; llvm.vector.reduce.fadd with a 0.0 start value (valid because the adds
; carry the fast flags, which include nsz).
%gep1.0 = getelementptr inbounds double, ptr %arg, i64 1
%ld1.0 = load double, ptr %gep1.0, align 8
%ld0.0 = load double, ptr %arg1, align 8
%mul1.0 = fmul fast double %ld0.0, %ld1.0
%gep2.0 = getelementptr inbounds double, ptr %arg1, i64 16
%ld2.0 = load double, ptr %gep2.0, align 8
%mul2.0 = fmul fast double %ld2.0, %ld1.0
%gep1.1 = getelementptr inbounds double, ptr %arg, i64 3
%ld1.1 = load double, ptr %gep1.1, align 8
%gep0.1 = getelementptr inbounds double, ptr %arg1, i64 1
%ld0.1 = load double, ptr %gep0.1, align 8
%mul1.1 = fmul fast double %ld0.1, %ld1.1
%rdx1.0 = fadd fast double %mul1.0, %mul1.1
%gep2.1 = getelementptr inbounds double, ptr %arg1, i64 17
%ld2.1 = load double, ptr %gep2.1, align 8
%mul2.1 = fmul fast double %ld2.1, %ld1.1
%rdx2.0 = fadd fast double %mul2.0, %mul2.1
%gep1.2 = getelementptr inbounds double, ptr %arg, i64 5
%ld1.2 = load double, ptr %gep1.2, align 8
%gep0.2 = getelementptr inbounds double, ptr %arg1, i64 2
%ld0.2 = load double, ptr %gep0.2, align 8
%mul1.2 = fmul fast double %ld0.2, %ld1.2
%rdx1.1 = fadd fast double %rdx1.0, %mul1.2
%gep2.2 = getelementptr inbounds double, ptr %arg1, i64 18
%ld2.2 = load double, ptr %gep2.2, align 8
%mul2.2 = fmul fast double %ld2.2, %ld1.2
%rdx2.1 = fadd fast double %rdx2.0, %mul2.2
%gep1.3 = getelementptr inbounds double, ptr %arg, i64 7
%ld1.3 = load double, ptr %gep1.3, align 8
%gep0.3 = getelementptr inbounds double, ptr %arg1, i64 3
%ld0.3 = load double, ptr %gep0.3, align 8
%mul1.3 = fmul fast double %ld0.3, %ld1.3
%rdx1.2 = fadd fast double %rdx1.1, %mul1.3
%gep2.3 = getelementptr inbounds double, ptr %arg1, i64 19
%ld2.3 = load double, ptr %gep2.3, align 8
%mul2.3 = fmul fast double %ld2.3, %ld1.3
%rdx2.2 = fadd fast double %rdx2.1, %mul2.3
%gep1.4 = getelementptr inbounds double, ptr %arg, i64 9
%ld1.4 = load double, ptr %gep1.4, align 8
%gep0.4 = getelementptr inbounds double, ptr %arg1, i64 4
%ld0.4 = load double, ptr %gep0.4, align 8
%mul1.4 = fmul fast double %ld0.4, %ld1.4
%rdx1.3 = fadd fast double %rdx1.2, %mul1.4
%gep2.4 = getelementptr inbounds double, ptr %arg1, i64 20
%ld2.4 = load double, ptr %gep2.4, align 8
%mul2.4 = fmul fast double %ld2.4, %ld1.4
%rdx2.3 = fadd fast double %rdx2.2, %mul2.4
%gep1.5 = getelementptr inbounds double, ptr %arg, i64 11
%ld1.5 = load double, ptr %gep1.5, align 8
%gep0.5 = getelementptr inbounds double, ptr %arg1, i64 5
%ld0.5 = load double, ptr %gep0.5, align 8
%mul1.5 = fmul fast double %ld0.5, %ld1.5
%rdx1.4 = fadd fast double %rdx1.3, %mul1.5
%gep2.5 = getelementptr inbounds double, ptr %arg1, i64 21
%ld2.5 = load double, ptr %gep2.5, align 8
%mul2.5 = fmul fast double %ld2.5, %ld1.5
%rdx2.4 = fadd fast double %rdx2.3, %mul2.5
%gep1.6 = getelementptr inbounds double, ptr %arg, i64 13
%ld1.6 = load double, ptr %gep1.6, align 8
%gep0.6 = getelementptr inbounds double, ptr %arg1, i64 6
%ld0.6 = load double, ptr %gep0.6, align 8
%mul1.6 = fmul fast double %ld0.6, %ld1.6
%rdx1.5 = fadd fast double %rdx1.4, %mul1.6
%gep2.6 = getelementptr inbounds double, ptr %arg1, i64 22
%ld2.6 = load double, ptr %gep2.6, align 8
%mul2.6 = fmul fast double %ld2.6, %ld1.6
%rdx2.5 = fadd fast double %rdx2.4, %mul2.6
%gep1.7 = getelementptr inbounds double, ptr %arg, i64 15
%ld1.7 = load double, ptr %gep1.7, align 8
%gep0.7 = getelementptr inbounds double, ptr %arg1, i64 7
%ld0.7 = load double, ptr %gep0.7, align 8
%mul1.7 = fmul fast double %ld0.7, %ld1.7
%rdx1 = fadd fast double %rdx1.5, %mul1.7
%gep2.7 = getelementptr inbounds double, ptr %arg1, i64 23
%ld2.7 = load double, ptr %gep2.7, align 8
%mul2.7 = fmul fast double %ld2.7, %ld1.7
%rdx2 = fadd fast double %rdx2.5, %mul2.7
; Build the <2 x double> from the two reduction results and scatter them
; to arg2[0] and arg2[16]; this insertelement build sequence is what makes
; matching the two 8-way reductions the profitable vectorization choice.
%i142 = insertelement <2 x double> poison, double %rdx1, i64 0
%i143 = insertelement <2 x double> %i142, double %rdx2, i64 1
%p = getelementptr inbounds double, ptr %arg2, <2 x i64> <i64 0, i64 16>
call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> %i143, <2 x ptr> %p, i32 8, <2 x i1> <i1 true, i1 true>)
ret void
}