; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,AVX
;
; This file tests the look-ahead operand reordering heuristic.
;
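; Roughly: when SLP bundles the operands of two commutative instructions, the
; look-ahead heuristic scores the candidate operand orderings by also inspecting
; the instructions that define those operands, a few levels deep, and prefers
; orderings whose operands are themselves likely to form vectorizable bundles
; (e.g. consecutive loads or matching opcodes).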
;
; This checks that operand reordering will reorder the operands of the adds
; by taking into consideration the instructions beyond the immediate
; predecessors.
;
;  A[0] B[0] C[0] D[0]  C[1] D[1] A[1] B[1]
;     \  /   \  /          \  /   \  /
;       -     -              -     -
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
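; The two scalar adds list their operands in opposite orders
; (%subAB_0 + %subCD_0 vs. %subCD_1 + %subAB_1), so the look-ahead scoring has
; to swap one pair in order to bundle {%subAB_0, %subAB_1} with
; {%subCD_0, %subCD_1}; that in turn lets the A/B and C/D loads be vectorized
; as consecutive <2 x double> loads, as the CHECK lines below verify.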
define void @lookahead_basic(ptr %array) {
; CHECK-LABEL: @lookahead_basic(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]]
; CHECK-NEXT: store <2 x double> [[TMP6]], ptr [[ARRAY]], align 8
; CHECK-NEXT: ret void
;
entry:
%idx1 = getelementptr inbounds double, ptr %array, i64 1
%idx2 = getelementptr inbounds double, ptr %array, i64 2
%idx3 = getelementptr inbounds double, ptr %array, i64 3
%idx4 = getelementptr inbounds double, ptr %array, i64 4
%idx5 = getelementptr inbounds double, ptr %array, i64 5
%idx6 = getelementptr inbounds double, ptr %array, i64 6
%idx7 = getelementptr inbounds double, ptr %array, i64 7
%A_0 = load double, ptr %array, align 8
%A_1 = load double, ptr %idx1, align 8
%B_0 = load double, ptr %idx2, align 8
%B_1 = load double, ptr %idx3, align 8
%C_0 = load double, ptr %idx4, align 8
%C_1 = load double, ptr %idx5, align 8
%D_0 = load double, ptr %idx6, align 8
%D_1 = load double, ptr %idx7, align 8
%subAB_0 = fsub fast double %A_0, %B_0
%subCD_0 = fsub fast double %C_0, %D_0
%subAB_1 = fsub fast double %A_1, %B_1
%subCD_1 = fsub fast double %C_1, %D_1
%addABCD_0 = fadd fast double %subAB_0, %subCD_0
%addCDAB_1 = fadd fast double %subCD_1, %subAB_1
store double %addABCD_0, ptr %array, align 8
store double %addCDAB_1, ptr %idx1, align 8
ret void
}
; Check whether the look-ahead operand reordering heuristic will avoid
; bundling the alt opcodes. The vectorized code should have no shuffles.
;
;  A[0] B[0] A[0] B[0]  A[1] B[1] A[1] B[1]
;     \  /   \  /          \  /   \  /
;       +     -              -     +
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
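; Grouping the two fadds into one bundle and the two fsubs into the other keeps
; each bundle single-opcode, so no alternate-opcode pattern (and hence no
; shuffle) is needed; both bundles then operate directly on the <2 x double>
; loads of A and B, as the CHECK lines below show.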
define void @lookahead_alt1(ptr %array) {
; CHECK-LABEL: @lookahead_alt1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 5
; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 7
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]]
; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[ARRAY]], align 8
; CHECK-NEXT: ret void
;
entry:
%idx1 = getelementptr inbounds double, ptr %array, i64 1
%idx2 = getelementptr inbounds double, ptr %array, i64 2
%idx3 = getelementptr inbounds double, ptr %array, i64 3
%idx4 = getelementptr inbounds double, ptr %array, i64 4
%idx5 = getelementptr inbounds double, ptr %array, i64 5
%idx6 = getelementptr inbounds double, ptr %array, i64 6
%idx7 = getelementptr inbounds double, ptr %array, i64 7
%A_0 = load double, ptr %array, align 8
%A_1 = load double, ptr %idx1, align 8
%B_0 = load double, ptr %idx2, align 8
%B_1 = load double, ptr %idx3, align 8
%addAB_0_L = fadd fast double %A_0, %B_0
%subAB_0_R = fsub fast double %A_0, %B_0
%subAB_1_L = fsub fast double %A_1, %B_1
%addAB_1_R = fadd fast double %A_1, %B_1
%addABCD_0 = fadd fast double %addAB_0_L, %subAB_0_R
%addCDAB_1 = fadd fast double %subAB_1_L, %addAB_1_R
store double %addABCD_0, ptr %array, align 8
store double %addCDAB_1, ptr %idx1, align 8
ret void
}
; This code should get vectorized all the way to the loads with shuffles for
; the alt opcodes.
;
;  A[0] B[0] C[0] D[0]  C[1] D[1] A[1] B[1]
;     \  /   \  /          \  /   \  /
;       +     -              +     -
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
define void @lookahead_alt2(ptr %array) {
; CHECK-LABEL: @lookahead_alt2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 2
; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 4
; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds double, ptr [[ARRAY]], i64 6
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[IDX2]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[IDX4]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[IDX6]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]]
; CHECK-NEXT: store <2 x double> [[TMP10]], ptr [[ARRAY]], align 8
; CHECK-NEXT: ret void
;
entry:
%idx1 = getelementptr inbounds double, ptr %array, i64 1
%idx2 = getelementptr inbounds double, ptr %array, i64 2
%idx3 = getelementptr inbounds double, ptr %array, i64 3
%idx4 = getelementptr inbounds double, ptr %array, i64 4
%idx5 = getelementptr inbounds double, ptr %array, i64 5
%idx6 = getelementptr inbounds double, ptr %array, i64 6
%idx7 = getelementptr inbounds double, ptr %array, i64 7
%A_0 = load double, ptr %array, align 8
%A_1 = load double, ptr %idx1, align 8
%B_0 = load double, ptr %idx2, align 8
%B_1 = load double, ptr %idx3, align 8
%C_0 = load double, ptr %idx4, align 8
%C_1 = load double, ptr %idx5, align 8
%D_0 = load double, ptr %idx6, align 8
%D_1 = load double, ptr %idx7, align 8
%addAB_0 = fadd fast double %A_0, %B_0
%subCD_0 = fsub fast double %C_0, %D_0
%addCD_1 = fadd fast double %C_1, %D_1
%subAB_1 = fsub fast double %A_1, %B_1
%addABCD_0 = fadd fast double %addAB_0, %subCD_0
%addCDAB_1 = fadd fast double %addCD_1, %subAB_1
store double %addABCD_0, ptr %array, align 8
store double %addCDAB_1, ptr %idx1, align 8
ret void
}
;
;  A[0] B[0] C[0] D[0]  A[1] B[2] A[2] B[1]
;     \  /   \  /       / \  /      \  /
;       -     -        U   -          -
;        \   /               \       /
;          +                     +
;          |                     |
;         S[0]                  S[1]
;
; SLP should reorder the operands of the RHS add taking into consideration the cost of external uses.
; It is more profitable to reorder the operands of the RHS add, because A[1] has an external use.
define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2) {
; CHECK-LABEL: @lookahead_external_uses(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2
; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8
; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
; CHECK-NEXT: [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
; CHECK-NEXT: store double [[TMP10]], ptr [[EXT1:%.*]], align 8
; CHECK-NEXT: ret void
;
entry:
%IdxA1 = getelementptr inbounds double, ptr %A, i64 1
%IdxB2 = getelementptr inbounds double, ptr %B, i64 2
%IdxA2 = getelementptr inbounds double, ptr %A, i64 2
%IdxB1 = getelementptr inbounds double, ptr %B, i64 1
%A0 = load double, ptr %A, align 8
%B0 = load double, ptr %B, align 8
%C0 = load double, ptr %C, align 8
%D0 = load double, ptr %D, align 8
%A1 = load double, ptr %IdxA1, align 8
%B2 = load double, ptr %IdxB2, align 8
%A2 = load double, ptr %IdxA2, align 8
%B1 = load double, ptr %IdxB1, align 8
%subA0B0 = fsub fast double %A0, %B0
%subC0D0 = fsub fast double %C0, %D0
%subA1B2 = fsub fast double %A1, %B2
%subA2B1 = fsub fast double %A2, %B1
%add0 = fadd fast double %subA0B0, %subC0D0
%add1 = fadd fast double %subA1B2, %subA2B1
%IdxS1 = getelementptr inbounds double, ptr %S, i64 1
store double %add0, ptr %S, align 8
store double %add1, ptr %IdxS1, align 8
; External use
store double %A1, ptr %Ext1, align 8
ret void
}
;  A[0] B[0] C[0] D[0]  A[1] B[2] A[2] B[1]
;     \  /   \  /       / \  /      \  / \
;       -     - U1,U2,U3   -          -   U4,U5
;        \   /               \       /
;          +                     +
;          |                     |
;         S[0]                  S[1]
;
;
; If we limit the users budget for the look-ahead heuristic to 2, then the
; look-ahead heuristic has no way of choosing B[1] (with 2 external users)
; over A[1] (with 3 external users).
; The result is that the operands of the Add are not reordered and the loads
; from A get vectorized instead of the loads from B.
;
define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr %Ext1, ptr %Ext2, ptr %Ext3, ptr %Ext4, ptr %Ext5) {
; CHECK-LABEL: @lookahead_limit_users_budget(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2
; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 2
; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8
; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8
; CHECK-NEXT: [[D0:%.*]] = load double, ptr [[D:%.*]], align 8
; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8
; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8
; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]]
; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
; CHECK-NEXT: store double [[TMP10]], ptr [[EXT1:%.*]], align 8
; CHECK-NEXT: store double [[TMP10]], ptr [[EXT2:%.*]], align 8
; CHECK-NEXT: store double [[TMP10]], ptr [[EXT3:%.*]], align 8
; CHECK-NEXT: store double [[B1]], ptr [[EXT4:%.*]], align 8
; CHECK-NEXT: store double [[B1]], ptr [[EXT5:%.*]], align 8
; CHECK-NEXT: ret void
;
entry:
%IdxA1 = getelementptr inbounds double, ptr %A, i64 1
%IdxB2 = getelementptr inbounds double, ptr %B, i64 2
%IdxA2 = getelementptr inbounds double, ptr %A, i64 2
%IdxB1 = getelementptr inbounds double, ptr %B, i64 1
%A0 = load double, ptr %A, align 8
%B0 = load double, ptr %B, align 8
%C0 = load double, ptr %C, align 8
%D0 = load double, ptr %D, align 8
%A1 = load double, ptr %IdxA1, align 8
%B2 = load double, ptr %IdxB2, align 8
%A2 = load double, ptr %IdxA2, align 8
%B1 = load double, ptr %IdxB1, align 8
%subA0B0 = fsub fast double %A0, %B0
%subC0D0 = fsub fast double %C0, %D0
%subA1B2 = fsub fast double %A1, %B2
%subA2B1 = fsub fast double %A2, %B1
%add0 = fadd fast double %subA0B0, %subC0D0
%add1 = fadd fast double %subA1B2, %subA2B1
%IdxS1 = getelementptr inbounds double, ptr %S, i64 1
store double %add0, ptr %S, align 8
store double %add1, ptr %IdxS1, align 8
; External uses of A1
store double %A1, ptr %Ext1, align 8
store double %A1, ptr %Ext2, align 8
store double %A1, ptr %Ext3, align 8
; External uses of B1
store double %B1, ptr %Ext4, align 8
store double %B1, ptr %Ext5, align 8
ret void
}
; This checks that the lookahead code does not crash when instructions with the
; same opcodes have different numbers of operands (in this case the calls).
%Class = type { i8 }
declare double @_ZN1i2ayEv(ptr)
declare double @_ZN1i2axEv()
define void @lookahead_crash(ptr %A, ptr %S, ptr %Arg0) {
; CHECK-LABEL: @lookahead_crash(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
; CHECK-NEXT: [[C0:%.*]] = call double @_ZN1i2ayEv(ptr [[ARG0:%.*]])
; CHECK-NEXT: [[C1:%.*]] = call double @_ZN1i2axEv()
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C1]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[S:%.*]], align 8
; CHECK-NEXT: ret void
;
%IdxA1 = getelementptr inbounds double, ptr %A, i64 1
%A0 = load double, ptr %A, align 8
%A1 = load double, ptr %IdxA1, align 8
%C0 = call double @_ZN1i2ayEv(ptr %Arg0)
%C1 = call double @_ZN1i2axEv()
%add0 = fadd fast double %A0, %C0
%add1 = fadd fast double %A1, %C1
%IdxS1 = getelementptr inbounds double, ptr %S, i64 1
store double %add0, ptr %S, align 8
store double %add1, ptr %IdxS1, align 8
ret void
}
; This checks that we choose to group consecutive extracts from the same vectors.
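; Grouping the extracts of lanes 0 and 1 of %loadVec (and likewise of %loadVec2)
; lets the vectorized code use the loaded <2 x double> values directly instead
; of rebuilding them from individual extractelement results.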
define void @ChecksExtractScores(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2) {
; CHECK-LABEL: @ChecksExtractScores(
; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
; CHECK-NEXT: [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
; CHECK-NEXT: [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
; CHECK-NEXT: [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
; CHECK-NEXT: ret void
;
%idx1 = getelementptr inbounds double, ptr %array, i64 1
%loadA0 = load double, ptr %array, align 4
%loadA1 = load double, ptr %idx1, align 4
%loadVec = load <2 x double>, ptr %vecPtr1, align 4
%extrA0 = extractelement <2 x double> %loadVec, i32 0
%extrA1 = extractelement <2 x double> %loadVec, i32 1
%loadVec2 = load <2 x double>, ptr %vecPtr2, align 4
%extrB0 = extractelement <2 x double> %loadVec2, i32 0
%extrB1 = extractelement <2 x double> %loadVec2, i32 1
%mul0 = fmul double %extrA0, %loadA0
%mul1 = fmul double %extrA1, %loadA0
%mul3 = fmul double %extrB0, %loadA1
%mul4 = fmul double %extrB1, %loadA1
%add0 = fadd double %mul0, %mul3
%add1 = fadd double %mul1, %mul4
%sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1
store double %add0, ptr %storeArray, align 8
store double %add1, ptr %sidx1, align 8
ret void
}
define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; SSE-LABEL: @ExtractIdxNotConstantInt1(
; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; SSE-NEXT: ret i1 [[CMP_I185]]
;
; AVX-LABEL: @ExtractIdxNotConstantInt1(
; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; AVX-NEXT: ret i1 [[CMP_I185]]
;
%vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef
%sub14.i167 = fsub float undef, %vecext.i291.i166
%fm = fmul float %a, %sub14.i167
%sub25.i168 = fsub float %fm, %b
%vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
%add36.i173 = fadd float %sub25.i168, 10.0
%mul72.i179 = fmul float %c, %vecext.i276.i169
%add78.i180 = fsub float %mul72.i179, 30.0
%add79.i181 = fadd float 2.0, %add78.i180
%mul123.i184 = fmul float %add36.i173, %add79.i181
%cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
ret i1 %cmp.i185
}
define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; SSE-LABEL: @ExtractIdxNotConstantInt2(
; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[SUB14_I167]], i32 0
; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1
; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; SSE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; SSE-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; SSE-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; SSE-NEXT: ret i1 [[CMP_I185]]
;
; AVX-LABEL: @ExtractIdxNotConstantInt2(
; AVX-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; AVX-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; AVX-NEXT: [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; AVX-NEXT: [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; AVX-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; AVX-NEXT: [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; AVX-NEXT: [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; AVX-NEXT: [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; AVX-NEXT: [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; AVX-NEXT: [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; AVX-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; AVX-NEXT: ret i1 [[CMP_I185]]
;
%vecext.i291.i166 = extractelement <4 x float> %vec, i64 1
%sub14.i167 = fsub float undef, %vecext.i291.i166
%fm = fmul float %a, %sub14.i167
%sub25.i168 = fsub float %fm, %b
%vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
%add36.i173 = fadd float %sub25.i168, 10.0
%mul72.i179 = fmul float %c, %vecext.i276.i169
%add78.i180 = fsub float %mul72.i179, 30.0
%add79.i181 = fadd float 2.0, %add78.i180
%mul123.i184 = fmul float %add36.i173, %add79.i181
%cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
ret i1 %cmp.i185
}
define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @foo(
; CHECK-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
; CHECK-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> <i32 poison, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+01>, float [[B:%.*]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP7]], <float 1.000000e+01, float 2.000000e+00>
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
; CHECK-NEXT: [[MUL123_I184:%.*]] = fmul float [[TMP9]], [[TMP10]]
; CHECK-NEXT: [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT: ret i1 [[CMP_I185]]
;
%vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
%sub14.i167 = fsub float undef, %vecext.i291.i166
%fm = fmul float %a, %sub14.i167
%sub25.i168 = fsub float %fm, %b
%vecext.i276.i169 = extractelement <4 x float> %vec, i64 1
%add36.i173 = fadd float %sub25.i168, 10.0
%mul72.i179 = fmul float %c, %vecext.i276.i169
%add78.i180 = fsub float %mul72.i179, 30.0
%add79.i181 = fadd float 2.0, %add78.i180
%mul123.i184 = fmul float %add36.i173, %add79.i181
%cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
ret i1 %cmp.i185
}
; Same as @ChecksExtractScores, but the extractelement vector operands do not match.
define void @ChecksExtractScores_different_vectors(ptr %storeArray, ptr %array, ptr %vecPtr1, ptr %vecPtr2, ptr %vecPtr3, ptr %vecPtr4) {
;
; SSE-LABEL: @ChecksExtractScores_different_vectors(
; SSE-NEXT: [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; SSE-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; SSE-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4
; SSE-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4
; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY:%.*]], align 4
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[LOADVEC2]], <2 x double> [[LOADVEC3]], <2 x i32> <i32 1, i32 2>
; SSE-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], [[TMP1]]
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3>
; SSE-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP1]]
; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP6]]
; SSE-NEXT: store <2 x double> [[TMP7]], ptr [[STOREARRAY:%.*]], align 8
; SSE-NEXT: ret void
;
; AVX-LABEL: @ChecksExtractScores_different_vectors(
; AVX-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, ptr [[ARRAY:%.*]], i64 1
; AVX-NEXT: [[LOADA0:%.*]] = load double, ptr [[ARRAY]], align 4
; AVX-NEXT: [[LOADA1:%.*]] = load double, ptr [[IDX1]], align 4
; AVX-NEXT: [[LOADVEC:%.*]] = load <2 x double>, ptr [[VECPTR1:%.*]], align 4
; AVX-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, ptr [[VECPTR2:%.*]], align 4
; AVX-NEXT: [[LOADVEC3:%.*]] = load <2 x double>, ptr [[VECPTR3:%.*]], align 4
; AVX-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, ptr [[VECPTR4:%.*]], align 4
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[LOADVEC]], <2 x double> [[LOADVEC2]], <2 x i32> <i32 0, i32 3>
; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[LOADVEC3]], <2 x double> [[LOADVEC4]], <2 x i32> <i32 0, i32 3>
; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[LOADA1]], i32 0
; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
; AVX-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]]
; AVX-NEXT: store <2 x double> [[TMP9]], ptr [[STOREARRAY:%.*]], align 8
; AVX-NEXT: ret void
;
%idx1 = getelementptr inbounds double, ptr %array, i64 1
%loadA0 = load double, ptr %array, align 4
%loadA1 = load double, ptr %idx1, align 4
%loadVec = load <2 x double>, ptr %vecPtr1, align 4
%loadVec2 = load <2 x double>, ptr %vecPtr2, align 4
%extrA0 = extractelement <2 x double> %loadVec, i32 0
%extrA1 = extractelement <2 x double> %loadVec2, i32 1
%loadVec3= load <2 x double>, ptr %vecPtr3, align 4
%loadVec4 = load <2 x double>, ptr %vecPtr4, align 4
%extrB0 = extractelement <2 x double> %loadVec3, i32 0
%extrB1 = extractelement <2 x double> %loadVec4, i32 1
%mul0 = fmul double %extrA0, %loadA0
%mul1 = fmul double %extrA1, %loadA0
%mul3 = fmul double %extrB0, %loadA1
%mul4 = fmul double %extrB1, %loadA1
%add0 = fadd double %mul0, %mul3
%add1 = fadd double %mul1, %mul4
%sidx1 = getelementptr inbounds double, ptr %storeArray, i64 1
store double %add0, ptr %storeArray, align 8
store double %add1, ptr %sidx1, align 8
ret void
}
; This checks that we prefer splats rather than reverse load vectors + shuffles.
; 2-wide splat loads in x86 use a single instruction so they are quite cheap.
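; (For example, SSE3's movddup loads a double from memory and duplicates it into
; both lanes of an XMM register in one instruction.)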
define double @splat_loads(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
; SSE-LABEL: @splat_loads(
; SSE-NEXT: entry:
; SSE-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; SSE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
; SSE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
; SSE-NEXT: [[ADD3:%.*]] = fadd double [[TMP6]], [[TMP7]]
; SSE-NEXT: ret double [[ADD3]]
;
; AVX-LABEL: @splat_loads(
; AVX-NEXT: entry:
; AVX-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1
; AVX-NEXT: [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8
; AVX-NEXT: [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8
; AVX-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]]
; AVX-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; AVX-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; AVX-NEXT: [[ADD3:%.*]] = fadd double [[TMP8]], [[TMP9]]
; AVX-NEXT: ret double [[ADD3]]
;
entry:
%gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1
%ld_1_0 = load double, ptr %array1, align 8
%ld_1_1 = load double, ptr %gep_1_1, align 8
%gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1
%ld_2_0 = load double, ptr %array2, align 8
%ld_2_1 = load double, ptr %gep_2_1, align 8
%mul1 = fmul double %ld_1_0, %ld_2_0
%mul2 = fmul double %ld_1_1, %ld_2_0
%mul3 = fmul double %ld_1_0, %ld_2_1
%mul4 = fmul double %ld_1_1, %ld_2_1
%add1 = fadd double %mul1, %mul3
%add2 = fadd double %mul2, %mul4
%add3 = fadd double %add1, %add2
ret double %add3
}
; Same as splat_loads() but the splat load has internal uses in the SLP graph.
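; The broadcast of %ld_2_0 feeds not only the fmuls but also the final fsubs, so
; the splat value has an extra user inside the vectorizable tree (see the reuse
; of [[TMP2]] by the fsub in the AVX CHECK lines below).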
define double @splat_loads_with_internal_uses(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
; SSE-LABEL: @splat_loads_with_internal_uses(
; SSE-NEXT: entry:
; SSE-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; SSE-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]]
; SSE-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
; SSE-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
; SSE-NEXT: [[RES:%.*]] = fadd double [[TMP8]], [[TMP9]]
; SSE-NEXT: ret double [[RES]]
;
; AVX-LABEL: @splat_loads_with_internal_uses(
; AVX-NEXT: entry:
; AVX-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds double, ptr [[ARRAY2:%.*]], i64 1
; AVX-NEXT: [[LD_2_0:%.*]] = load double, ptr [[ARRAY2]], align 8
; AVX-NEXT: [[LD_2_1:%.*]] = load double, ptr [[GEP_2_1]], align 8
; AVX-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LD_2_0]], i32 0
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[LD_2_1]], i32 0
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer
; AVX-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP0]], [[TMP5]]
; AVX-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; AVX-NEXT: [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], [[TMP2]]
; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
; AVX-NEXT: [[RES:%.*]] = fadd double [[TMP9]], [[TMP10]]
; AVX-NEXT: ret double [[RES]]
;
entry:
%gep_1_1 = getelementptr inbounds double, ptr %array1, i64 1
%ld_1_0 = load double, ptr %array1, align 8
%ld_1_1 = load double, ptr %gep_1_1, align 8
%gep_2_1 = getelementptr inbounds double, ptr %array2, i64 1
%ld_2_0 = load double, ptr %array2, align 8
%ld_2_1 = load double, ptr %gep_2_1, align 8
%mul1 = fmul double %ld_1_0, %ld_2_0
%mul2 = fmul double %ld_1_1, %ld_2_0
%mul3 = fmul double %ld_1_0, %ld_2_1
%mul4 = fmul double %ld_1_1, %ld_2_1
%add1 = fadd double %mul1, %mul3
%add2 = fadd double %mul2, %mul4
; One more user for the broadcast of %ld_2_0
%sub1 = fsub double %add1, %ld_2_0
%sub2 = fsub double %add2, %ld_2_0
%res = fadd double %sub1, %sub2
ret double %res
}