[SLP] Avoid -passes=instcombine stages in SLP tests (#146257)

Fixes #145511 Note that there are still two instances of --passes=slp-vectorizer,instcombine left unchanged because it seems that the tests are meant to run in conjunction with instcombine and removing instcombine would invalidate their original objective: [llvm/test/Transforms/SLPVectorizer/arith-div-undef.ll](https://github.com/llvm/llvm-project/blob/main/llvm/test/Transforms/SLPVectorizer/arith-div-undef.ll) [llvm/test/Transforms/SLPVectorizer/slp-hr-with-reuse.ll](https://github.com/llvm/llvm-project/blob/main/llvm/test/Transforms/SLPVectorizer/slp-hr-with-reuse.ll)
2025-07-02 06:14:41 -04:00
parent 7ead20db28
commit 6e1e89ee38
38 changed files with 2051 additions and 1528 deletions
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -passes=slp-vectorizer,instcombine -pass-remarks-output=%t | FileCheck %s
+; RUN: opt < %s -S -passes=slp-vectorizer -pass-remarks-output=%t | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=REMARK %s
-; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t | FileCheck %s
+; RUN: opt < %s -S -aa-pipeline=basic-aa -passes='slp-vectorizer' -pass-remarks-output=%t | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=REMARK %s

 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -16,10 +16,10 @@ target triple = "aarch64--linux-gnu"

 define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: @gather_multiple_use(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], splat (i32 15)
 ; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP5]], splat (i32 65537)
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], splat (i32 65535)
@@ -57,22 +57,26 @@ define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
@data = global [6 x [258 x i8]] zeroinitializer, align 1
 define void @gather_load(ptr noalias %ptr) {
 ; CHECK-LABEL: @gather_load(
-; CHECK-NEXT:    [[ARRAYIDX182:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR:%.*]], i64 2
-; CHECK-NEXT:    [[ARRAYIDX183:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 6
-; CHECK-NEXT:    [[ARRAYIDX185:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 8
-; CHECK-NEXT:    [[L0:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 258), align 1
+; CHECK-NEXT:    [[ARRAYIDX182:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX183:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX184:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX185:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX149:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 1, i64 0
+; CHECK-NEXT:    [[L0:%.*]] = load i8, ptr [[ARRAYIDX149]], align 1
 ; CHECK-NEXT:    [[CONV150:%.*]] = zext i8 [[L0]] to i16
-; CHECK-NEXT:    [[ADD152:%.*]] = add nuw nsw i16 [[CONV150]], 10
-; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 517), align 1
+; CHECK-NEXT:    [[ADD152:%.*]] = add i16 10, [[CONV150]]
+; CHECK-NEXT:    [[ARRAYIDX155:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 2, i64 1
+; CHECK-NEXT:    [[L1:%.*]] = load i8, ptr [[ARRAYIDX155]], align 1
 ; CHECK-NEXT:    [[CONV156:%.*]] = zext i8 [[L1]] to i16
-; CHECK-NEXT:    [[ADD158:%.*]] = add nuw nsw i16 [[CONV156]], 20
-; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 776), align 1
+; CHECK-NEXT:    [[ADD158:%.*]] = add i16 20, [[CONV156]]
+; CHECK-NEXT:    [[ARRAYIDX161:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 3, i64 2
+; CHECK-NEXT:    [[L2:%.*]] = load i8, ptr [[ARRAYIDX161]], align 1
 ; CHECK-NEXT:    [[CONV162:%.*]] = zext i8 [[L2]] to i16
-; CHECK-NEXT:    [[ADD164:%.*]] = add nuw nsw i16 [[CONV162]], 30
-; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr getelementptr inbounds nuw (i8, ptr @data, i64 1035), align 1
+; CHECK-NEXT:    [[ADD164:%.*]] = add i16 30, [[CONV162]]
+; CHECK-NEXT:    [[ARRAYIDX167:%.*]] = getelementptr inbounds [6 x [258 x i8]], ptr @data, i64 0, i64 4, i64 3
+; CHECK-NEXT:    [[L3:%.*]] = load i8, ptr [[ARRAYIDX167]], align 1
 ; CHECK-NEXT:    [[CONV168:%.*]] = zext i8 [[L3]] to i16
-; CHECK-NEXT:    [[ADD170:%.*]] = add nuw nsw i16 [[CONV168]], 40
+; CHECK-NEXT:    [[ADD170:%.*]] = add i16 40, [[CONV168]]
 ; CHECK-NEXT:    store i16 [[ADD152]], ptr [[ARRAYIDX182]], align 2
 ; CHECK-NEXT:    store i16 [[ADD158]], ptr [[ARRAYIDX183]], align 2
 ; CHECK-NEXT:    store i16 [[ADD164]], ptr [[ARRAYIDX184]], align 2
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=slp-vectorizer,dce,instcombine < %s | FileCheck %s --check-prefix=GENERIC
-; RUN: opt -S -mcpu=kryo -passes=slp-vectorizer,dce,instcombine < %s | FileCheck %s --check-prefix=KRYO
+; RUN: opt -S -passes=slp-vectorizer < %s | FileCheck %s --check-prefix=GENERIC
+; RUN: opt -S -mcpu=kryo -passes=slp-vectorizer < %s | FileCheck %s --check-prefix=KRYO

 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -36,57 +36,49 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; GENERIC-NEXT:    [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; GENERIC-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
 ; GENERIC-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
 ; GENERIC-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
-; GENERIC-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
-; GENERIC-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
+; GENERIC-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0
+; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i32 [[TMP5]]
 ; GENERIC-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
 ; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
-; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; GENERIC-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
-; GENERIC-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
+; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]]
+; GENERIC-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1
+; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP8]]
 ; GENERIC-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
 ; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
 ; GENERIC-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; GENERIC-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
-; GENERIC-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
+; GENERIC-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2
+; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP9]]
 ; GENERIC-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
 ; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
 ; GENERIC-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; GENERIC-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
-; GENERIC-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
+; GENERIC-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3
+; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP11]]
 ; GENERIC-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
 ; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
 ; GENERIC-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; GENERIC-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
-; GENERIC-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
+; GENERIC-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4
+; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP14]]
 ; GENERIC-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
 ; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
 ; GENERIC-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; GENERIC-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
-; GENERIC-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
+; GENERIC-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5
+; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP15]]
 ; GENERIC-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
 ; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
 ; GENERIC-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; GENERIC-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
-; GENERIC-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
+; GENERIC-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6
+; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP17]]
 ; GENERIC-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
 ; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; GENERIC-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
-; GENERIC-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
-; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
+; GENERIC-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
+; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP20]]
 ; GENERIC-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
 ; GENERIC-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP28]] to i32
 ; GENERIC-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
@@ -109,57 +101,49 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; KRYO-NEXT:    [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; KRYO-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
 ; KRYO-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
 ; KRYO-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
-; KRYO-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
-; KRYO-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
-; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
+; KRYO-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0
+; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i32 [[TMP5]]
 ; KRYO-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
 ; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
-; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; KRYO-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
-; KRYO-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
+; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]]
+; KRYO-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1
+; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP8]]
 ; KRYO-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
 ; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
 ; KRYO-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; KRYO-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
-; KRYO-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
+; KRYO-NEXT:    [[TMP9:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2
+; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP9]]
 ; KRYO-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
 ; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
 ; KRYO-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; KRYO-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
-; KRYO-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
-; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
+; KRYO-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3
+; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP11]]
 ; KRYO-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
 ; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
 ; KRYO-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; KRYO-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
-; KRYO-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
-; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
+; KRYO-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4
+; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP14]]
 ; KRYO-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
 ; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
 ; KRYO-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; KRYO-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
-; KRYO-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
-; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
+; KRYO-NEXT:    [[TMP15:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5
+; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP15]]
 ; KRYO-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
 ; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
 ; KRYO-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; KRYO-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
-; KRYO-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
-; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
+; KRYO-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6
+; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP17]]
 ; KRYO-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
 ; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; KRYO-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
-; KRYO-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
-; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
+; KRYO-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
+; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i32 [[TMP20]]
 ; KRYO-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
 ; KRYO-NEXT:    [[CONV65:%.*]] = zext i16 [[TMP28]] to i32
 ; KRYO-NEXT:    [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
@@ -293,55 +277,55 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read
 ; GENERIC-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
+; GENERIC-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
 ; GENERIC-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; GENERIC-NEXT:    [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; GENERIC-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
 ; GENERIC-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
-; GENERIC-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
-; GENERIC-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
+; GENERIC-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]]
+; GENERIC-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0
 ; GENERIC-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
 ; GENERIC-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
 ; GENERIC-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
-; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; GENERIC-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
+; GENERIC-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]]
+; GENERIC-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1
 ; GENERIC-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
 ; GENERIC-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
 ; GENERIC-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
 ; GENERIC-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; GENERIC-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
+; GENERIC-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2
 ; GENERIC-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
 ; GENERIC-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
 ; GENERIC-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
 ; GENERIC-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; GENERIC-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
+; GENERIC-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3
 ; GENERIC-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
 ; GENERIC-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
 ; GENERIC-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
 ; GENERIC-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; GENERIC-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
+; GENERIC-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4
 ; GENERIC-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
 ; GENERIC-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
 ; GENERIC-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
 ; GENERIC-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; GENERIC-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
+; GENERIC-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5
 ; GENERIC-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
 ; GENERIC-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
 ; GENERIC-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
 ; GENERIC-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; GENERIC-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
+; GENERIC-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6
 ; GENERIC-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
 ; GENERIC-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
 ; GENERIC-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
 ; GENERIC-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; GENERIC-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
+; GENERIC-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
 ; GENERIC-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
 ; GENERIC-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
 ; GENERIC-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
@@ -366,55 +350,55 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read
 ; KRYO-NEXT:    [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT:    [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds nuw i8, ptr [[A_ADDR_0101]], i64 16
+; KRYO-NEXT:    [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
 ; KRYO-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; KRYO-NEXT:    [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; KRYO-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
 ; KRYO-NEXT:    [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
-; KRYO-NEXT:    [[TMP4:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP3]]
-; KRYO-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i64 0
+; KRYO-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]]
+; KRYO-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP4]], i32 0
 ; KRYO-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[G:%.*]], i64 [[TMP6]]
 ; KRYO-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
 ; KRYO-NEXT:    [[CONV3:%.*]] = zext i16 [[TMP7]] to i32
-; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
-; KRYO-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i64 1
+; KRYO-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV3]], [[SUM_0102]]
+; KRYO-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP4]], i32 1
 ; KRYO-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP9]]
 ; KRYO-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
 ; KRYO-NEXT:    [[CONV11:%.*]] = zext i16 [[TMP10]] to i32
 ; KRYO-NEXT:    [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
-; KRYO-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i64 2
+; KRYO-NEXT:    [[TMP11:%.*]] = extractelement <8 x i32> [[TMP4]], i32 2
 ; KRYO-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP12]]
 ; KRYO-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX19]], align 2
 ; KRYO-NEXT:    [[CONV20:%.*]] = zext i16 [[TMP13]] to i32
 ; KRYO-NEXT:    [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
-; KRYO-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i64 3
+; KRYO-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP4]], i32 3
 ; KRYO-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP15]]
 ; KRYO-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
 ; KRYO-NEXT:    [[CONV29:%.*]] = zext i16 [[TMP16]] to i32
 ; KRYO-NEXT:    [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
-; KRYO-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i64 4
+; KRYO-NEXT:    [[TMP17:%.*]] = extractelement <8 x i32> [[TMP4]], i32 4
 ; KRYO-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP18]]
 ; KRYO-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX37]], align 2
 ; KRYO-NEXT:    [[CONV38:%.*]] = zext i16 [[TMP19]] to i32
 ; KRYO-NEXT:    [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
-; KRYO-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i64 5
+; KRYO-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP4]], i32 5
 ; KRYO-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP21]]
 ; KRYO-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX46]], align 2
 ; KRYO-NEXT:    [[CONV47:%.*]] = zext i16 [[TMP22]] to i32
 ; KRYO-NEXT:    [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
-; KRYO-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i64 6
+; KRYO-NEXT:    [[TMP23:%.*]] = extractelement <8 x i32> [[TMP4]], i32 6
 ; KRYO-NEXT:    [[TMP24:%.*]] = sext i32 [[TMP23]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP24]]
 ; KRYO-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX55]], align 2
 ; KRYO-NEXT:    [[CONV56:%.*]] = zext i16 [[TMP25]] to i32
 ; KRYO-NEXT:    [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
-; KRYO-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i64 7
+; KRYO-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP4]], i32 7
 ; KRYO-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
 ; KRYO-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, ptr [[G]], i64 [[TMP27]]
 ; KRYO-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX64]], align 2
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s

 ; These tests check that we remove from consideration pairs of seed
@@ -46,9 +46,9 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -58,29 +58,25 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = zext nneg i32 [[TMP6]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[G:%.*]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP6]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0
-; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP12]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1
-; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
 ; CHECK-NEXT:    [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
@@ -143,7 +139,7 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[Y:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[Y:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -153,26 +149,22 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OP_RDX]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = zext nneg i32 [[TMP4]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[G:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP3]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[T11]] to i64
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[T11]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i64 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T12]], i64 3
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T12]], i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP8]], <2 x i32> [[TMP6]], i64 0)
 ; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
 ; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ;test_i16_extend NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer' -slp-threshold=-5 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s


@@ -25,42 +25,42 @@ define void @test_i16_extend(ptr %p.1, ptr %p.2, i32 %idx.i32) {
 ; CHECK-LABEL: @test_i16_extend(
 ; CHECK-NEXT:    [[P_0:%.*]] = load ptr, ptr @global, align 8
 ; CHECK-NEXT:    [[IDX_0:%.*]] = zext i32 [[IDX_I32:%.*]] to i64
-; CHECK-NEXT:    [[T53:%.*]] = getelementptr inbounds nuw i16, ptr [[P_1:%.*]], i64 [[IDX_0]]
-; CHECK-NEXT:    [[T56:%.*]] = getelementptr inbounds nuw i16, ptr [[P_2:%.*]], i64 [[IDX_0]]
+; CHECK-NEXT:    [[T53:%.*]] = getelementptr inbounds i16, ptr [[P_1:%.*]], i64 [[IDX_0]]
+; CHECK-NEXT:    [[T56:%.*]] = getelementptr inbounds i16, ptr [[P_2:%.*]], i64 [[IDX_0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[T53]], align 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr [[T56]], align 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <8 x i16> [[TMP3]] to <8 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x i32> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
 ; CHECK-NEXT:    [[T60:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[L_1:%.*]] = load i32, ptr [[T60]], align 4
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; CHECK-NEXT:    [[T71:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[L_2:%.*]] = load i32, ptr [[T71]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
 ; CHECK-NEXT:    [[T82:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[L_3:%.*]] = load i32, ptr [[T82]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[TMP5]], i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i32> [[TMP5]], i32 3
 ; CHECK-NEXT:    [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
 ; CHECK-NEXT:    [[T93:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP13]]
 ; CHECK-NEXT:    [[L_4:%.*]] = load i32, ptr [[T93]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP5]], i64 4
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
 ; CHECK-NEXT:    [[T104:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP15]]
 ; CHECK-NEXT:    [[L_5:%.*]] = load i32, ptr [[T104]], align 4
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP5]], i64 5
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
 ; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
 ; CHECK-NEXT:    [[T115:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[L_6:%.*]] = load i32, ptr [[T115]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP5]], i64 6
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <8 x i32> [[TMP5]], i32 6
 ; CHECK-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
 ; CHECK-NEXT:    [[T126:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP19]]
 ; CHECK-NEXT:    [[L_7:%.*]] = load i32, ptr [[T126]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP5]], i64 7
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <8 x i32> [[TMP5]], i32 7
 ; CHECK-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP20]] to i64
 ; CHECK-NEXT:    [[T137:%.*]] = getelementptr inbounds i32, ptr [[P_0]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[L_8:%.*]] = load i32, ptr [[T137]], align 4
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -passes=slp-vectorizer,instcombine -mtriple=aarch64--linux-gnu < %s | FileCheck %s
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64--linux-gnu < %s | FileCheck %s

 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64"
@@ -133,7 +133,7 @@ define i16 @reduce_blockstrided2(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-LABEL: @reduce_blockstrided2(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[X:%.*]], align 2
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]]
@@ -165,7 +165,7 @@ define i16 @reduce_blockstrided2(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
 ; CHECK-NEXT:    [[ARRAYIDX32:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM15]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX32]], align 2
-; CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX33]], align 2
 ; CHECK-NEXT:    [[ARRAYIDX36:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX36]], align 2
@@ -254,9 +254,9 @@ define i16 @reduce_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-LABEL: @reduce_blockstrided3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[L0:%.*]] = load i16, ptr [[X:%.*]], align 2
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 1
 ; CHECK-NEXT:    [[L1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 2
 ; CHECK-NEXT:    [[L2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]]
@@ -269,10 +269,12 @@ define i16 @reduce_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[IDXPROM9:%.*]] = sext i32 [[ADD8]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM9]]
 ; CHECK-NEXT:    [[L6:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[STRIDE]], 3
+; CHECK-NEXT:    [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
 ; CHECK-NEXT:    [[L8:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 1
 ; CHECK-NEXT:    [[L9:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
-; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds nuw i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 2
 ; CHECK-NEXT:    [[L10:%.*]] = load i16, ptr [[ARRAYIDX16]], align 2
 ; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]]
 ; CHECK-NEXT:    [[L12:%.*]] = load i16, ptr [[ARRAYIDX20]], align 2
@@ -347,9 +349,9 @@ define i16 @reduce_blockstrided4(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP6]])
 ; CHECK-NEXT:    ret i16 [[TMP7]]
 ;
@@ -414,33 +416,128 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[OFF1:%.*]] to i64
 ; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P2:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4
+; TODO: Dead code must be removed below.
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
+; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32
+; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT:    [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT:    [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32
+; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[TMP35:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1
+; CHECK-NEXT:    [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32
+; CHECK-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 1
+; CHECK-NEXT:    [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT:    [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1
+; CHECK-NEXT:    [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32
+; CHECK-NEXT:    [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 5
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1
+; CHECK-NEXT:    [[CONV14_2:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 5
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX15_2]], align 1
+; CHECK-NEXT:    [[CONV16_2:%.*]] = zext i8 [[TMP7]] to i32
+; CHECK-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1
+; CHECK-NEXT:    [[CONV21_2:%.*]] = zext i8 [[TMP8]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 2
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1
+; CHECK-NEXT:    [[CONV23_2:%.*]] = zext i8 [[TMP9]] to i32
+; CHECK-NEXT:    [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 6
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1
+; CHECK-NEXT:    [[CONV26_2:%.*]] = zext i8 [[TMP41]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 6
+; CHECK-NEXT:    [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1
+; CHECK-NEXT:    [[CONV28_2:%.*]] = zext i8 [[TMP42]] to i32
+; CHECK-NEXT:    [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 3
+; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1
+; CHECK-NEXT:    [[CONV33_2:%.*]] = zext i8 [[TMP43]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 3
+; CHECK-NEXT:    [[TMP44:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1
+; CHECK-NEXT:    [[CONV35_2:%.*]] = zext i8 [[TMP44]] to i32
+; CHECK-NEXT:    [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 7
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1
+; CHECK-NEXT:    [[CONV38_2:%.*]] = zext i8 [[TMP14]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 7
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX39_2]], align 1
+; CHECK-NEXT:    [[CONV40_2:%.*]] = zext i8 [[TMP15]] to i32
+; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ADD_PTR_2]], align 1
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i8 [[TMP16]] to i32
+; CHECK-NEXT:    [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP17:%.*]] = load i8, ptr [[ADD_PTR64_2]], align 1
+; CHECK-NEXT:    [[CONV2_3:%.*]] = zext i8 [[TMP17]] to i32
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4
+; CHECK-NEXT:    [[TMP48:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1
+; CHECK-NEXT:    [[CONV4_3:%.*]] = zext i8 [[TMP48]] to i32
+; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4
+; CHECK-NEXT:    [[TMP49:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[CONV6_3:%.*]] = zext i8 [[TMP49]] to i32
+; CHECK-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 1
+; CHECK-NEXT:    [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1
+; CHECK-NEXT:    [[CONV9_3:%.*]] = zext i8 [[TMP50]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 1
+; CHECK-NEXT:    [[TMP51:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1
+; CHECK-NEXT:    [[CONV11_3:%.*]] = zext i8 [[TMP51]] to i32
+; CHECK-NEXT:    [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 5
+; CHECK-NEXT:    [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX13_3]], align 1
+; CHECK-NEXT:    [[CONV14_3:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 5
+; CHECK-NEXT:    [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX15_3]], align 1
+; CHECK-NEXT:    [[CONV16_3:%.*]] = zext i8 [[TMP23]] to i32
+; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 2
+; CHECK-NEXT:    [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX20_3]], align 1
+; CHECK-NEXT:    [[CONV21_3:%.*]] = zext i8 [[TMP24]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 2
+; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX22_3]], align 1
+; CHECK-NEXT:    [[CONV23_3:%.*]] = zext i8 [[TMP25]] to i32
+; CHECK-NEXT:    [[ARRAYIDX25_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 6
+; CHECK-NEXT:    [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX25_3]], align 1
+; CHECK-NEXT:    [[CONV26_3:%.*]] = zext i8 [[TMP26]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 6
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1
+; CHECK-NEXT:    [[CONV28_3:%.*]] = zext i8 [[TMP27]] to i32
+; CHECK-NEXT:    [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 3
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX32_3]], align 1
+; CHECK-NEXT:    [[CONV33_3:%.*]] = zext i8 [[TMP28]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 3
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1
+; CHECK-NEXT:    [[CONV35_3:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT:    [[ARRAYIDX37_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 7
+; CHECK-NEXT:    [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX37_3]], align 1
+; CHECK-NEXT:    [[CONV38_3:%.*]] = zext i8 [[TMP30]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 7
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1
+; CHECK-NEXT:    [[CONV40_3:%.*]] = zext i8 [[TMP31]] to i32
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP38:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP0]], i64 0)
+; CHECK-NEXT:    [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP38]], <4 x i8> [[TMP4]], i64 4)
+; CHECK-NEXT:    [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP1]], i64 8)
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP5]], i64 12)
 ; CHECK-NEXT:    [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP12]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP45:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP46:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP45]], <4 x i8> [[TMP12]], i64 4)
+; CHECK-NEXT:    [[TMP47:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP46]], <4 x i8> [[TMP3]], i64 8)
+; CHECK-NEXT:    [[TMP18:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP47]], <4 x i8> [[TMP13]], i64 12)
 ; CHECK-NEXT:    [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP11]], [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = mul <16 x i32> [[TMP11]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP20]])
 ; CHECK-NEXT:    ret i32 [[TMP21]]
 ;
@@ -677,7 +774,7 @@ entry:
 define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, ptr nocapture noundef writeonly %z, i32 noundef %stride) {
 ; CHECK-LABEL: @store_blockstrided3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[X:%.*]], i64 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1
 ; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64
@@ -696,7 +793,7 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1
 ; CHECK-NEXT:    [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]]
-; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds nuw i8, ptr [[Y:%.*]], i64 8
+; CHECK-NEXT:    [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i64 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]]
 ; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
@@ -705,20 +802,21 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
-; CHECK-NEXT:    [[ARRAYIDX72:%.*]] = getelementptr inbounds nuw i8, ptr [[Z:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX72:%.*]] = getelementptr inbounds i32, ptr [[Z:%.*]], i64 1
 ; CHECK-NEXT:    [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]]
-; CHECK-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 24
+; CHECK-NEXT:    [[ARRAYIDX76:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 6
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT:    [[ARRAYIDX84:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 28
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[ARRAYIDX84:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 7
 ; CHECK-NEXT:    [[MUL81:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
 ; CHECK-NEXT:    [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
-; CHECK-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds nuw i8, ptr [[Z]], i64 44
+; CHECK-NEXT:    [[ARRAYIDX88:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 11
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
 ; CHECK-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = load <2 x i32>, ptr [[ARRAYIDX49]], align 4
@@ -727,9 +825,10 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4
 ; CHECK-NEXT:    store i32 [[MUL81]], ptr [[ARRAYIDX76]], align 4
 ; CHECK-NEXT:    store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4
-; CHECK-NEXT:    [[TMP20:%.*]] = mul nsw <2 x i32> [[TMP15]], [[TMP17]]
-; CHECK-NEXT:    [[TMP21:%.*]] = mul nsw <2 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP20]], <2 x i32> [[TMP21]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 ; CHECK-NEXT:    store <4 x i32> [[TMP19]], ptr [[ARRAYIDX84]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -837,9 +936,10 @@ define void @store_blockstrided4(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
 ; CHECK-NEXT:    store <8 x i16> [[TMP6]], ptr [[DST0:%.*]], align 2
 ; CHECK-NEXT:    ret void
 ;
@@ -912,35 +1012,132 @@ define void @store_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[OFF1:%.*]] to i64
 ; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P2:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4
-; CHECK-NEXT:    [[DST4:%.*]] = getelementptr inbounds nuw i8, ptr [[DST0:%.*]], i64 16
-; CHECK-NEXT:    [[DST8:%.*]] = getelementptr inbounds nuw i8, ptr [[DST0]], i64 32
-; CHECK-NEXT:    [[DST12:%.*]] = getelementptr inbounds nuw i8, ptr [[DST0]], i64 48
+; TODO: Dead code must be removed below.
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
+; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i8 [[TMP32]] to i32
+; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP33:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1
+; CHECK-NEXT:    [[CONV2_2:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT:    [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX3_2]], align 1
+; CHECK-NEXT:    [[CONV4_2:%.*]] = zext i8 [[TMP34]] to i32
+; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[TMP35:%.*]] = load i8, ptr [[ARRAYIDX5_2]], align 1
+; CHECK-NEXT:    [[CONV6_2:%.*]] = zext i8 [[TMP35]] to i32
+; CHECK-NEXT:    [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 1
+; CHECK-NEXT:    [[TMP36:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1
+; CHECK-NEXT:    [[CONV9_2:%.*]] = zext i8 [[TMP36]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX10_2]], align 1
+; CHECK-NEXT:    [[CONV11_2:%.*]] = zext i8 [[TMP37]] to i32
+; CHECK-NEXT:    [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 5
+; CHECK-NEXT:    [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX13_2]], align 1
+; CHECK-NEXT:    [[CONV14_2:%.*]] = zext i8 [[TMP38]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 5
+; CHECK-NEXT:    [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX15_2]], align 1
+; CHECK-NEXT:    [[CONV16_2:%.*]] = zext i8 [[TMP39]] to i32
+; CHECK-NEXT:    [[ARRAYIDX20_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 2
+; CHECK-NEXT:    [[TMP40:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1
+; CHECK-NEXT:    [[CONV21_2:%.*]] = zext i8 [[TMP40]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 2
+; CHECK-NEXT:    [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1
+; CHECK-NEXT:    [[CONV23_2:%.*]] = zext i8 [[TMP41]] to i32
+; CHECK-NEXT:    [[ARRAYIDX25_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 6
+; CHECK-NEXT:    [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX25_2]], align 1
+; CHECK-NEXT:    [[CONV26_2:%.*]] = zext i8 [[TMP42]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 6
+; CHECK-NEXT:    [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX27_2]], align 1
+; CHECK-NEXT:    [[CONV28_2:%.*]] = zext i8 [[TMP43]] to i32
+; CHECK-NEXT:    [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 3
+; CHECK-NEXT:    [[TMP44:%.*]] = load i8, ptr [[ARRAYIDX32_2]], align 1
+; CHECK-NEXT:    [[CONV33_2:%.*]] = zext i8 [[TMP44]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 3
+; CHECK-NEXT:    [[TMP45:%.*]] = load i8, ptr [[ARRAYIDX34_2]], align 1
+; CHECK-NEXT:    [[CONV35_2:%.*]] = zext i8 [[TMP45]] to i32
+; CHECK-NEXT:    [[ARRAYIDX37_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 7
+; CHECK-NEXT:    [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX37_2]], align 1
+; CHECK-NEXT:    [[CONV38_2:%.*]] = zext i8 [[TMP46]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 7
+; CHECK-NEXT:    [[TMP47:%.*]] = load i8, ptr [[ARRAYIDX39_2]], align 1
+; CHECK-NEXT:    [[CONV40_2:%.*]] = zext i8 [[TMP47]] to i32
+; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i8, ptr [[ADD_PTR_2]], align 1
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32
+; CHECK-NEXT:    [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]]
+; CHECK-NEXT:    [[TMP49:%.*]] = load i8, ptr [[ADD_PTR64_2]], align 1
+; CHECK-NEXT:    [[CONV2_3:%.*]] = zext i8 [[TMP49]] to i32
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4
+; CHECK-NEXT:    [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX3_3]], align 1
+; CHECK-NEXT:    [[CONV4_3:%.*]] = zext i8 [[TMP50]] to i32
+; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4
+; CHECK-NEXT:    [[TMP51:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1
+; CHECK-NEXT:    [[CONV6_3:%.*]] = zext i8 [[TMP51]] to i32
+; CHECK-NEXT:    [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 1
+; CHECK-NEXT:    [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1
+; CHECK-NEXT:    [[CONV9_3:%.*]] = zext i8 [[TMP20]] to i32
+; CHECK-NEXT:    [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 1
+; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1
+; CHECK-NEXT:    [[CONV11_3:%.*]] = zext i8 [[TMP21]] to i32
+; CHECK-NEXT:    [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 5
+; CHECK-NEXT:    [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX13_3]], align 1
+; CHECK-NEXT:    [[CONV14_3:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT:    [[ARRAYIDX15_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 5
+; CHECK-NEXT:    [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX15_3]], align 1
+; CHECK-NEXT:    [[CONV16_3:%.*]] = zext i8 [[TMP23]] to i32
+; CHECK-NEXT:    [[ARRAYIDX20_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 2
+; CHECK-NEXT:    [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX20_3]], align 1
+; CHECK-NEXT:    [[CONV21_3:%.*]] = zext i8 [[TMP24]] to i32
+; CHECK-NEXT:    [[ARRAYIDX22_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 2
+; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX22_3]], align 1
+; CHECK-NEXT:    [[CONV23_3:%.*]] = zext i8 [[TMP25]] to i32
+; CHECK-NEXT:    [[ARRAYIDX25_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 6
+; CHECK-NEXT:    [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX25_3]], align 1
+; CHECK-NEXT:    [[CONV26_3:%.*]] = zext i8 [[TMP26]] to i32
+; CHECK-NEXT:    [[ARRAYIDX27_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 6
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1
+; CHECK-NEXT:    [[CONV28_3:%.*]] = zext i8 [[TMP27]] to i32
+; CHECK-NEXT:    [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 3
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX32_3]], align 1
+; CHECK-NEXT:    [[CONV33_3:%.*]] = zext i8 [[TMP28]] to i32
+; CHECK-NEXT:    [[ARRAYIDX34_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 3
+; CHECK-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1
+; CHECK-NEXT:    [[CONV35_3:%.*]] = zext i8 [[TMP29]] to i32
+; CHECK-NEXT:    [[ARRAYIDX37_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 7
+; CHECK-NEXT:    [[TMP30:%.*]] = load i8, ptr [[ARRAYIDX37_3]], align 1
+; CHECK-NEXT:    [[CONV38_3:%.*]] = zext i8 [[TMP30]] to i32
+; CHECK-NEXT:    [[ARRAYIDX39_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 7
+; CHECK-NEXT:    [[TMP31:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1
+; CHECK-NEXT:    [[CONV40_3:%.*]] = zext i8 [[TMP31]] to i32
+; CHECK-NEXT:    [[DST4:%.*]] = getelementptr inbounds i32, ptr [[DST0:%.*]], i64 4
+; CHECK-NEXT:    [[DST8:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 8
+; CHECK-NEXT:    [[DST12:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 12
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i8>, ptr [[P2]], align 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw <4 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul <4 x i32> [[TMP6]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = zext <4 x i8> [[TMP12]] to <4 x i32>
-; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = mul <4 x i32> [[TMP11]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1
 ; CHECK-NEXT:    [[TMP16:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <4 x i8> [[TMP17]] to <4 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw nsw <4 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = mul <4 x i32> [[TMP16]], [[TMP18]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[DST0]], align 4
 ; CHECK-NEXT:    store <4 x i32> [[TMP9]], ptr [[DST4]], align 4
 ; CHECK-NEXT:    store <4 x i32> [[TMP14]], ptr [[DST8]], align 4
@@ -1198,20 +1395,20 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[ST1:%.*]] to i64
 ; CHECK-NEXT:    [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P1:%.*]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P2:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_1]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_1]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4
 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]]
-; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR_2]], i64 4
-; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds nuw i8, ptr [[ADD_PTR64_2]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
@@ -1225,33 +1422,29 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP13:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP0]], i64 0)
+; CHECK-NEXT:    [[TMP14:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP13]], <4 x i8> [[TMP4]], i64 4)
+; CHECK-NEXT:    [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP14]], <4 x i8> [[TMP8]], i64 8)
+; CHECK-NEXT:    [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP12]], i64 12)
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP20:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP5]], i64 4)
+; CHECK-NEXT:    [[TMP21:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP20]], <4 x i8> [[TMP9]], i64 8)
+; CHECK-NEXT:    [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP19]], i64 12)
 ; CHECK-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP29:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP30:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP29]], <4 x i8> [[TMP6]], i64 4)
+; CHECK-NEXT:    [[TMP28:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP30]], <4 x i8> [[TMP10]], i64 8)
+; CHECK-NEXT:    [[TMP32:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP28]], <4 x i8> [[TMP27]], i64 12)
 ; CHECK-NEXT:    [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
-; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    [[TMP35:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP3]], i64 0)
+; CHECK-NEXT:    [[TMP36:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP35]], <4 x i8> [[TMP7]], i64 4)
+; CHECK-NEXT:    [[TMP37:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP36]], <4 x i8> [[TMP11]], i64 8)
+; CHECK-NEXT:    [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP37]], <4 x i8> [[TMP34]], i64 12)
 ; CHECK-NEXT:    [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]]
 ; CHECK-NEXT:    [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], splat (i32 16)
@@ -1259,11 +1452,11 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
 ; CHECK-NEXT:    [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]]
-; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP46]], <16 x i32> [[TMP45]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i32> [[TMP45]], <16 x i32> [[TMP46]], <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
 ; CHECK-NEXT:    [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
 ; CHECK-NEXT:    [[TMP49:%.*]] = add nsw <16 x i32> [[TMP47]], [[TMP48]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = sub nsw <16 x i32> [[TMP47]], [[TMP48]]
-; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP50]], <16 x i32> [[TMP49]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 20, i32 21, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 29, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP51]], [[TMP52]]
 ; CHECK-NEXT:    [[TMP54:%.*]] = add nsw <16 x i32> [[TMP51]], [[TMP52]]
@@ -1271,7 +1464,7 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1,
 ; CHECK-NEXT:    [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]]
 ; CHECK-NEXT:    [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]]
-; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP57]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP60:%.*]] = lshr <16 x i32> [[TMP59]], splat (i32 15)
 ; CHECK-NEXT:    [[TMP61:%.*]] = and <16 x i32> [[TMP60]], splat (i32 65537)
 ; CHECK-NEXT:    [[TMP62:%.*]] = mul nuw <16 x i32> [[TMP61]], splat (i32 65535)
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s

 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -167,7 +167,9 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
 ;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s

 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -167,7 +167,9 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3_31:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_31]]
 ;
  %v0.0 = extractelement <2 x i32> %v0, i32 0
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX9 %s

 define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-LABEL: @uadd_sat_v2i16(
@@ -264,8 +264,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -273,8 +276,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -317,19 +323,27 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
-; GFX8-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
+; GFX8-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]])
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]])
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
-; GFX9-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
+; GFX9-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]])
 ; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX9 %s

 define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-LABEL: @uadd_sat_v2i16(
@@ -12,7 +12,7 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -47,7 +47,7 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -82,7 +82,7 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.sadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -117,7 +117,7 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.ssub.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -152,7 +152,7 @@ define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -177,7 +177,7 @@ define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -202,7 +202,7 @@ define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -227,7 +227,7 @@ define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -255,7 +255,7 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
 ; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX7-NEXT:    ret <3 x i16> [[INS_2]]
@@ -264,8 +264,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -273,8 +276,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -309,7 +315,7 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
 ; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
 ; GFX7-NEXT:    [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX7-NEXT:    [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
@@ -317,19 +323,27 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
-; GFX8-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
+; GFX8-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]])
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]])
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
-; GFX9-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
+; GFX9-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG2:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP1]])
 ; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG2]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP4]], <2 x i16> [[TMP7]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX9 %s

 define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-LABEL: @uadd_sat_v2i16(
@@ -12,7 +12,7 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -24,7 +24,7 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX8-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -54,7 +54,7 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -66,7 +66,7 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX8-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -96,7 +96,7 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -108,7 +108,7 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX8-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -138,7 +138,7 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX7-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -150,7 +150,7 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
 ; GFX8-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    ret <2 x i16> [[INS_1]]
 ;
@@ -180,7 +180,7 @@ define <2 x i32> @uadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -205,7 +205,7 @@ define <2 x i32> @usub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.umax.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.umax.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -230,7 +230,7 @@ define <2 x i32> @sadd_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.smin.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.smin.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -255,7 +255,7 @@ define <2 x i32> @ssub_sat_v2i32(<2 x i32> %arg0, <2 x i32> %arg1) {
 ; GCN-NEXT:    [[ARG1_1:%.*]] = extractelement <2 x i32> [[ARG1]], i64 1
 ; GCN-NEXT:    [[ADD_0:%.*]] = call i32 @llvm.smax.i32(i32 [[ARG0_0]], i32 [[ARG1_0]])
 ; GCN-NEXT:    [[ADD_1:%.*]] = call i32 @llvm.smax.i32(i32 [[ARG0_1]], i32 [[ARG1_1]])
-; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> poison, i32 [[ADD_0]], i64 0
+; GCN-NEXT:    [[INS_0:%.*]] = insertelement <2 x i32> undef, i32 [[ADD_0]], i64 0
 ; GCN-NEXT:    [[INS_1:%.*]] = insertelement <2 x i32> [[INS_0]], i32 [[ADD_1]], i64 1
 ; GCN-NEXT:    ret <2 x i32> [[INS_1]]
 ;
@@ -283,7 +283,7 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX7-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
 ; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX7-NEXT:    ret <3 x i16> [[INS_2]]
@@ -299,7 +299,7 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
@@ -308,8 +308,11 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP0:%.*]] = call <3 x i16> @llvm.umin.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP0]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -344,7 +347,7 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ; GFX7-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
 ; GFX7-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
 ; GFX7-NEXT:    [[ADD_3:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
-; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX7-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX7-NEXT:    [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX7-NEXT:    [[INS_2:%.*]] = insertelement <4 x i16> [[INS_1]], i16 [[ADD_2]], i64 2
 ; GFX7-NEXT:    [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
@@ -358,9 +361,10 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ; GFX8-NEXT:    [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1
 ; GFX8-NEXT:    [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]])
 ; GFX8-NEXT:    [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0
+; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP1:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP3]])
+; GFX8-NEXT:    [[INS_0:%.*]] = insertelement <4 x i16> undef, i16 [[ADD_0]], i64 0
 ; GFX8-NEXT:    [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1
 ; GFX8-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[INS_1]], <4 x i16> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -368,10 +372,14 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]])
-; GFX9-NEXT:    [[TMP1:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]])
+; GFX9-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT:    [[TMP8:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP6]], <2 x i16> [[TMP7]])
 ; GFX9-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP4]])
+; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
--- a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
+++ b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s

 ; Regression test for a bug in the SLP vectorizer that was causing
 ; these rotates to be incorrectly combined into a vector rotate.
@@ -9,16 +9,16 @@ target triple = "wasm32-unknown-unknown"

 define void @foo(<2 x i64> %x, <4 x i32> %y, ptr %out) #0 {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
-; CHECK-NEXT:    [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 2
+; CHECK-NEXT:    [[A:%.*]] = extractelement <2 x i64> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[B:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
 ; CHECK-NEXT:    [[CONV6:%.*]] = zext i32 [[B]] to i64
 ; CHECK-NEXT:    [[C:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[A]], i64 [[A]], i64 [[CONV6]])
 ; CHECK-NEXT:    store i64 [[C]], ptr [[OUT:%.*]], align 8
-; CHECK-NEXT:    [[D:%.*]] = extractelement <2 x i64> [[X]], i64 1
-; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x i32> [[Y]], i64 3
+; CHECK-NEXT:    [[D:%.*]] = extractelement <2 x i64> [[X]], i32 1
+; CHECK-NEXT:    [[E:%.*]] = extractelement <4 x i32> [[Y]], i32 3
 ; CHECK-NEXT:    [[CONV17:%.*]] = zext i32 [[E]] to i64
 ; CHECK-NEXT:    [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]])
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[OUT]], i32 8
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[OUT]], i32 1
 ; CHECK-NEXT:    store i64 [[F]], ptr [[ARRAYIDX2]], align 8
 ; CHECK-NEXT:    ret void
 ;
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -1,21 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2

 define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-LABEL: @ceil_floor(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i64 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i64 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
 ; SSE-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
 ; SSE-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
 ; SSE-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
@@ -24,31 +24,31 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-NEXT:    [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
 ; SSE-NEXT:    [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]])
 ; SSE-NEXT:    [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]])
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
-; SSE-NEXT:    [[R23:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i64 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i64 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i64 6
-; SSE-NEXT:    [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i64 7
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT:    [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x float> [[R71]]
 ;
 ; SLM-LABEL: @ceil_floor(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; SLM-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SLM-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; SLM-NEXT:    [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]])
 ; SLM-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
 ; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
 ; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -56,20 +56,20 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SLM-NEXT:    ret <8 x float> [[R71]]
 ;
 ; AVX-LABEL: @ceil_floor(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT:    [[TMP8:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP9]])
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP10]])
+; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
 ; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -77,20 +77,20 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX-NEXT:    ret <8 x float> [[R71]]
 ;
 ; AVX2-LABEL: @ceil_floor(
-; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX2-NEXT:    [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]])
 ; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
 ; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -1,21 +1,21 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX2

 define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-LABEL: @ceil_floor(
-; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
-; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i64 4
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i64 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
 ; SSE-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
 ; SSE-NEXT:    [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
 ; SSE-NEXT:    [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
@@ -24,75 +24,75 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SSE-NEXT:    [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
 ; SSE-NEXT:    [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]])
 ; SSE-NEXT:    [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]])
-; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
-; SSE-NEXT:    [[R23:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
-; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i64 4
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i64 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i64 6
-; SSE-NEXT:    [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i64 7
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT:    [[R71:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x float> [[R71]]
 ;
 ; SLM-LABEL: @ceil_floor(
-; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; SLM-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SLM-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; SLM-NEXT:    [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]])
 ; SLM-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
 ; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; SLM-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; SLM-NEXT:    ret <8 x float> [[R71]]
 ;
 ; AVX-LABEL: @ceil_floor(
-; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT:    [[TMP8:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP9]])
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP10:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP10]])
+; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
 ; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX-NEXT:    ret <8 x float> [[R71]]
 ;
 ; AVX2-LABEL: @ceil_floor(
-; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
+; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[TMP1:%.*]], i32 0
+; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
 ; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
 ; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX2-NEXT:    [[TMP10:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP2]])
 ; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> zeroinitializer, float [[AB0]], i32 0
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3
 ; AVX2-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:    ret <8 x float> [[R71]]
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast-inseltpoison.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512

 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE2-LABEL: @sitofp_uitofp(
@@ -12,7 +12,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @sitofp_uitofp(
@@ -20,7 +21,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @sitofp_uitofp(
@@ -74,7 +76,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @fptosi_fptoui(
@@ -82,7 +85,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX-LABEL: @fptosi_fptoui(
@@ -132,37 +136,51 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {

 define <8 x float> @fneg_fabs(<8 x float> %a) {
 ; SSE2-LABEL: @fneg_fabs(
-; SSE2-NEXT:    [[A:%.*]] = fneg <8 x float> [[A1:%.*]]
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A1]])
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; SSE2-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648)
+; SSE2-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647)
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
+; SSE2-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
 ; SSE2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; SLM-LABEL: @fneg_fabs(
-; SLM-NEXT:    [[A:%.*]] = fneg <8 x float> [[A1:%.*]]
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A1]])
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648)
+; SLM-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647)
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
+; SLM-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
 ; SLM-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX-LABEL: @fneg_fabs(
-; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX2-LABEL: @fneg_fabs(
-; AVX2-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX2-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX2-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX512-LABEL: @fneg_fabs(
-; AVX512-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX512-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX512-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX512-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
  %a0 = extractelement <8 x float> %a, i32 0
@@ -214,7 +232,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @sext_zext(
@@ -222,7 +241,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX-LABEL: @sext_zext(
@@ -275,7 +295,9 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; CHECK-NEXT:    [[R71:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    ret <8 x float> [[R71]]
 ;
  %a0 = extractelement <4 x i32> %a, i32 0
@@ -321,7 +343,7 @@ define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; CHECK-NEXT:    ret <8 x float> [[R71]]
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512

 define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE2-LABEL: @sitofp_uitofp(
@@ -12,7 +12,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @sitofp_uitofp(
@@ -20,7 +21,8 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP6]], <4 x float> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @sitofp_uitofp(
@@ -74,7 +76,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @fptosi_fptoui(
@@ -82,7 +85,8 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX-LABEL: @fptosi_fptoui(
@@ -132,37 +136,51 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) {

 define <8 x float> @fneg_fabs(<8 x float> %a) {
 ; SSE2-LABEL: @fneg_fabs(
-; SSE2-NEXT:    [[A:%.*]] = fneg <8 x float> [[A1:%.*]]
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A1]])
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE2-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; SSE2-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648)
+; SSE2-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647)
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
+; SSE2-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
 ; SSE2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; SLM-LABEL: @fneg_fabs(
-; SLM-NEXT:    [[A:%.*]] = fneg <8 x float> [[A1:%.*]]
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A1]])
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; SLM-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP3]], splat (i32 -2147483648)
+; SLM-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP4]], splat (i32 2147483647)
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
+; SLM-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
 ; SLM-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX-LABEL: @fneg_fabs(
-; AVX-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX2-LABEL: @fneg_fabs(
-; AVX2-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX2-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX2-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX2-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX2-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
 ; AVX512-LABEL: @fneg_fabs(
-; AVX512-NEXT:    [[TMP1:%.*]] = fneg <8 x float> [[A:%.*]]
-; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[A]])
-; AVX512-NEXT:    [[DOTUNCASTED:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT:    [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX512-NEXT:    [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; AVX512-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT:    [[DOTUNCASTED:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
 ; AVX512-NEXT:    ret <8 x float> [[DOTUNCASTED]]
 ;
  %a0 = extractelement <8 x float> %a, i32 0
@@ -214,7 +232,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; SSE2-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
 ; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE2-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @sext_zext(
@@ -222,7 +241,8 @@ define <8 x i32> @sext_zext(<8 x i16> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX-LABEL: @sext_zext(
@@ -275,7 +295,9 @@ define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
-; CHECK-NEXT:    [[R71:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    ret <8 x float> [[R71]]
 ;
  %a0 = extractelement <4 x i32> %a, i32 0
@@ -321,7 +343,7 @@ define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R52:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; CHECK-NEXT:    ret <8 x float> [[R71]]
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp-inseltpoison.ll
@@ -1,34 +1,46 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512

 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @fadd_fsub_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SSE-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @fadd_fsub_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @fadd_fsub_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX2-LABEL: @fadd_fsub_v8f32(
@@ -80,35 +92,51 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {

 define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @fmul_fdiv_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SSE-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @fmul_fdiv_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX2-LABEL: @fmul_fdiv_v8f32(
-; AVX2-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX2-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX2-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX2-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX2-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX512-LABEL: @fmul_fdiv_v8f32(
@@ -154,30 +182,39 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {

 define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
 ; SSE-LABEL: @fmul_fdiv_v4f32_const(
-; SSE-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; SSE-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; SSE-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; SSE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v4f32_const(
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; SLM-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.000000e+00, float 1.000000e+00>
-; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
+; SLM-NEXT:    [[AB2:%.*]] = fdiv float [[A2]], 1.000000e+00
+; SLM-NEXT:    [[AB3:%.*]] = fdiv float [[A3]], 5.000000e-01
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[A2]], i64 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i64 3
+; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
 ; SLM-NEXT:    ret <4 x float> [[R3]]
 ;
 ; AVX-LABEL: @fmul_fdiv_v4f32_const(
-; AVX-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; AVX2-LABEL: @fmul_fdiv_v4f32_const(
-; AVX2-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX2-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX2-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX2-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; AVX512-LABEL: @fmul_fdiv_v4f32_const(
-; AVX512-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX512-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX512-NEXT:    ret <4 x float> [[TMP1]]
 ;
  %a0 = extractelement <4 x float> %a, i32 0
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -1,34 +1,46 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX512

 define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @fadd_fsub_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SSE-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @fadd_fsub_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @fadd_fsub_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fadd <8 x float> [[A]], [[B]]
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP10:%.*]] = fsub <4 x float> [[TMP2]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP11]], [[TMP4]]
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX2-LABEL: @fadd_fsub_v8f32(
@@ -80,35 +92,51 @@ define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {

 define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @fmul_fdiv_v8f32(
-; SSE-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SSE-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SSE-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; SSE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SSE-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v8f32(
-; SLM-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; SLM-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; SLM-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; SLM-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX-LABEL: @fmul_fdiv_v8f32(
-; AVX-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; AVX-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX2-LABEL: @fmul_fdiv_v8f32(
-; AVX2-NEXT:    [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
-; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
-; AVX2-NEXT:    [[TMP3:%.*]] = fmul <8 x float> [[A]], [[B]]
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP3:%.*]], <8 x float> poison, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
+; AVX2-NEXT:    [[TMP10:%.*]] = fdiv <4 x float> [[TMP2]], [[TMP9]]
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
 ; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <4 x i32> <i32 0, i32 3, i32 4, i32 7>
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
+; AVX2-NEXT:    [[TMP6:%.*]] = fmul <4 x float> [[TMP11]], [[TMP4]]
+; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP7]], <4 x float> [[TMP10]], i64 4)
+; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 5, i32 1, i32 2, i32 6, i32 7, i32 3>
 ; AVX2-NEXT:    ret <8 x float> [[TMP5]]
 ;
 ; AVX512-LABEL: @fmul_fdiv_v8f32(
@@ -154,30 +182,39 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {

 define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
 ; SSE-LABEL: @fmul_fdiv_v4f32_const(
-; SSE-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; SSE-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; SSE-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; SSE-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; SLM-LABEL: @fmul_fdiv_v4f32_const(
-; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i64 3
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; SLM-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], <float 2.000000e+00, float 1.000000e+00>
-; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
+; SLM-NEXT:    [[AB2:%.*]] = fdiv float [[A2]], 1.000000e+00
+; SLM-NEXT:    [[AB3:%.*]] = fdiv float [[A3]], 5.000000e-01
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[A2]], i64 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i64 3
+; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[TMP3]], float [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
 ; SLM-NEXT:    ret <4 x float> [[R3]]
 ;
 ; AVX-LABEL: @fmul_fdiv_v4f32_const(
-; AVX-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; AVX2-LABEL: @fmul_fdiv_v4f32_const(
-; AVX2-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX2-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX2-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX2-NEXT:    ret <4 x float> [[TMP1]]
 ;
 ; AVX512-LABEL: @fmul_fdiv_v4f32_const(
-; AVX512-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX512-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP2:%.*]] = fdiv <4 x float> [[A]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 5.000000e-01>
+; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; AVX512-NEXT:    ret <4 x float> [[TMP1]]
 ;
  %a0 = extractelement <4 x float> %a, i32 0
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -1,26 +1,32 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512

 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @add_sub_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]]
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @add_sub_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]]
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @add_sub_v8i32(
@@ -130,19 +136,25 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {

 define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_shl_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]]
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]]
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32(
@@ -204,7 +216,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32_const(
@@ -212,7 +225,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
@@ -262,24 +276,25 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {

 define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
-; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6
-; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP10]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
 ; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @ashr_lshr_shl_v8i32(
@@ -293,7 +308,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SLM-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; SLM-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SLM-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SLM-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX1-LABEL: @ashr_lshr_shl_v8i32(
@@ -307,7 +324,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX1-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX1-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX1-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX1-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX2-LABEL: @ashr_lshr_shl_v8i32(
@@ -321,7 +340,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX2-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX2-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX2-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX2-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX512-LABEL: @ashr_lshr_shl_v8i32(
@@ -335,7 +356,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX512-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX512-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX512-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX512-NEXT:    ret <8 x i32> [[R71]]
 ;
  %a0 = extractelement <8 x i32> %a, i32 0
@@ -407,100 +430,130 @@ define <8 x i32> @add_v8i32_undefs(<8 x i32> %a) {

 define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 ; SSE-LABEL: @sdiv_v8i32_undefs(
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; SSE-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; SSE-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; SSE-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; SSE-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; SSE-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; SSE-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; SSE-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @sdiv_v8i32_undefs(
-; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; SLM-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; SLM-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; SLM-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; SLM-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; SLM-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; SLM-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; SLM-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SLM-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX1-LABEL: @sdiv_v8i32_undefs(
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX1-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX1-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX1-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; AVX1-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; AVX1-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX1-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX1-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; AVX1-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; AVX1-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX2-LABEL: @sdiv_v8i32_undefs(
-; AVX2-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX2-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX2-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX2-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX2-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX2-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX2-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
 ; AVX2-NEXT:    [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], <i32 8, i32 16>
+; AVX2-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX2-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
 ; AVX2-NEXT:    [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], <i32 8, i32 16>
-; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
 ; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5
+; AVX2-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4
+; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX512-LABEL: @sdiv_v8i32_undefs(
-; AVX512-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX512-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; AVX512-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX512-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX512-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX512-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX512-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX512-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
 ; AVX512-NEXT:    [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], <i32 8, i32 16>
+; AVX512-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX512-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
 ; AVX512-NEXT:    [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], <i32 8, i32 16>
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i64 1
+; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x i32> poison, i32 [[AB0]], i32 0
+; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
 ; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5
+; AVX512-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; AVX512-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 8, i32 9>
+; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX512-NEXT:    ret <8 x i32> [[R71]]
 ;
  %a0 = extractelement <8 x i32> %a, i32 0
@@ -533,26 +586,28 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; SSE-LABEL: @add_sub_v8i32_splat(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SSE-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @add_sub_v8i32_splat(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SLM-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SLM-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
 ; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX1-LABEL: @add_sub_v8i32_splat(
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX1-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX1-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
@@ -560,7 +615,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; AVX1-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX2-LABEL: @add_sub_v8i32_splat(
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX2-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX2-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
@@ -568,7 +623,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; AVX2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX512-LABEL: @add_sub_v8i32_splat(
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX512-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX512-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -1,26 +1,32 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512

 define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @add_sub_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]]
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @add_sub_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP4]]
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP9]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @add_sub_v8i32(
@@ -130,19 +136,25 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {

 define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_shl_v8i32(
-; SSE-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]]
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]]
+; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32(
-; SLM-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
-; SLM-NEXT:    [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP2:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP8:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP4]]
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shl <4 x i32> [[TMP3]], [[TMP9]]
+; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32(
@@ -204,7 +216,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SSE-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; SLM-LABEL: @ashr_shl_v8i32_const(
@@ -212,7 +225,8 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 ; SLM-NEXT:    [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 2)
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 3)
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP6]], <4 x i32> [[TMP4]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX1-LABEL: @ashr_shl_v8i32_const(
@@ -262,24 +276,25 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {

 define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
-; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6
-; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
+; SSE-NEXT:    [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP10]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
 ; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @ashr_lshr_shl_v8i32(
@@ -293,7 +308,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SLM-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; SLM-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SLM-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; SLM-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX1-LABEL: @ashr_lshr_shl_v8i32(
@@ -307,7 +324,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX1-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX1-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX1-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX1-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX1-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX2-LABEL: @ashr_lshr_shl_v8i32(
@@ -321,7 +340,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX2-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX2-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX2-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX2-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX512-LABEL: @ashr_lshr_shl_v8i32(
@@ -335,7 +356,9 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; AVX512-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX512-NEXT:    [[TMP9:%.*]] = shl <4 x i32> [[TMP6]], [[TMP7]]
 ; AVX512-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; AVX512-NEXT:    [[R71:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX512-NEXT:    ret <8 x i32> [[R71]]
 ;
  %a0 = extractelement <8 x i32> %a, i32 0
@@ -407,100 +430,130 @@ define <8 x i32> @add_v8i32_undefs(<8 x i32> %a) {

 define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 ; SSE-LABEL: @sdiv_v8i32_undefs(
-; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; SSE-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; SSE-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; SSE-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; SSE-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; SSE-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; SSE-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; SSE-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; SSE-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i64 1
-; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @sdiv_v8i32_undefs(
-; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SLM-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; SLM-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; SLM-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; SLM-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; SLM-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; SLM-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; SLM-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; SLM-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i64 1
-; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SLM-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SLM-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX1-LABEL: @sdiv_v8i32_undefs(
-; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i64 2
-; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i64 3
-; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i64 6
-; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; AVX1-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX1-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX1-NEXT:    [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; AVX1-NEXT:    [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; AVX1-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX1-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX1-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; AVX1-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX1-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX1-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX1-NEXT:    [[AB2:%.*]] = sdiv i32 [[A2]], 8
 ; AVX1-NEXT:    [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; AVX1-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX1-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX1-NEXT:    [[AB6:%.*]] = sdiv i32 [[A6]], 8
 ; AVX1-NEXT:    [[AB7:%.*]] = sdiv i32 [[A7]], 16
-; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i64 1
-; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i64 2
-; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i64 3
-; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i64 5
-; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i64 6
-; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; AVX1-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX1-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX1-NEXT:    [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX1-NEXT:    [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX1-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; AVX1-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; AVX1-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX1-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; AVX1-NEXT:    ret <8 x i32> [[R7]]
 ;
 ; AVX2-LABEL: @sdiv_v8i32_undefs(
-; AVX2-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX2-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX2-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX2-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX2-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX2-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX2-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
 ; AVX2-NEXT:    [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], <i32 8, i32 16>
+; AVX2-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX2-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
 ; AVX2-NEXT:    [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], <i32 8, i32 16>
-; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i64 1
+; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX2-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
 ; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 4, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5
+; AVX2-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4
+; AVX2-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX2-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX2-NEXT:    ret <8 x i32> [[R71]]
 ;
 ; AVX512-LABEL: @sdiv_v8i32_undefs(
-; AVX512-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 1
-; AVX512-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; AVX512-NEXT:    [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX512-NEXT:    [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX512-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; AVX512-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; AVX512-NEXT:    [[AB0:%.*]] = sdiv i32 [[A0]], undef
 ; AVX512-NEXT:    [[AB1:%.*]] = sdiv i32 [[A1]], 4
 ; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 2, i32 3>
 ; AVX512-NEXT:    [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], <i32 8, i32 16>
+; AVX512-NEXT:    [[AB4:%.*]] = sdiv i32 [[A4]], undef
 ; AVX512-NEXT:    [[AB5:%.*]] = sdiv i32 [[A5]], 4
 ; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 6, i32 7>
 ; AVX512-NEXT:    [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], <i32 8, i32 16>
-; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 undef, i32 poison, i32 poison, i32 poison>, i32 [[AB1]], i64 1
+; AVX512-NEXT:    [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX512-NEXT:    [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
 ; AVX512-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 4, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i64 5
+; AVX512-NEXT:    [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB4]], i32 4
+; AVX512-NEXT:    [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; AVX512-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX512-NEXT:    [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
 ; AVX512-NEXT:    ret <8 x i32> [[R71]]
 ;
  %a0 = extractelement <8 x i32> %a, i32 0
@@ -533,26 +586,28 @@ define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
 define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; SSE-LABEL: @add_sub_v8i32_splat(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SSE-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4)
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @add_sub_v8i32_splat(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+; SLM-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SLM-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
 ; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SLM-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP8]], <4 x i32> [[TMP6]], i64 4)
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX1-LABEL: @add_sub_v8i32_splat(
-; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX1-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX1-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX1-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX1-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
@@ -560,7 +615,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; AVX1-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX2-LABEL: @add_sub_v8i32_splat(
-; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX2-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX2-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
@@ -568,7 +623,7 @@ define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
 ; AVX2-NEXT:    ret <8 x i32> [[TMP5]]
 ;
 ; AVX512-LABEL: @add_sub_v8i32_splat(
-; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i64 0
+; AVX512-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[B:%.*]], i32 0
 ; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
 ; AVX512-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
 ; AVX512-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle-inseltpoison.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s

 define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @g(
@@ -86,8 +86,8 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
@@ -107,14 +107,14 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {

 define i8 @k(<4 x i8> %x) {
 ; CHECK-LABEL: @k(
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
@@ -136,14 +136,14 @@ define i8 @k_bb(<4 x i8> %x) {
 ; CHECK-LABEL: @k_bb(
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 | FileCheck %s

 define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @g(
@@ -39,8 +39,14 @@ define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {

 define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-LABEL: @h_undef(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> <i8 undef, i8 poison, i8 poison, i8 poison>, <4 x i32> <i32 4, i32 3, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 1, i32 5, i32 6>
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> undef, i32 0
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[X0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[X3]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[Y1]], i32 2
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[Y2]], i32 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[TMP2]], [[TMP2]]
 ; CHECK-NEXT:    ret <4 x i8> [[TMP3]]
 ;
@@ -87,8 +93,8 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[X]], <4 x i8> [[Y]], <2 x i32> <i32 3, i32 6>
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    ret i8 [[TMP8]]
 ;
@@ -108,14 +114,14 @@ define i8 @j(<4 x i8> %x, <4 x i8> %y) {

 define i8 @k(<4 x i8> %x) {
 ; CHECK-LABEL: @k(
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
@@ -137,14 +143,14 @@ define i8 @k_bb(<4 x i8> %x) {
 ; CHECK-LABEL: @k_bb(
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1:%.*]], <4 x i8> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i8> [[TMP2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i8> [[TMP3]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP2]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i8> [[TMP10]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i8> [[TMP6]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = sdiv i8 [[TMP7]], [[TMP8]]
 ; CHECK-NEXT:    ret i8 [[TMP9]]
 ;
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute-inseltpoison.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s

 ;
 ; Check that we can commute operands based on the predicate.
--- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64--- -mattr=+avx  | FileCheck %s

 ;
 ; Check that we can commute operands based on the predicate.
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX

 ;
 ; 128-bit vectors
@@ -169,21 +169,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
@@ -214,14 +214,14 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; PR50392
 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @test_v4f64_partial_swizzle(
-; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; CHECK-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2
+; CHECK-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; CHECK-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
+; CHECK-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i32 3
 ; CHECK-NEXT:    ret <4 x double> [[R03]]
 ;
  %a0 = extractelement <4 x double> %a, i64 0
@@ -243,21 +243,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -305,21 +305,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE-LABEL: @test_v4i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4i64(
@@ -351,21 +351,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @test_v8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8i32(
@@ -413,21 +413,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SLM-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX

 ;
 ; 128-bit vectors
@@ -169,21 +169,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
@@ -214,15 +214,15 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; PR50392
 define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64_partial_swizzle(
-; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2
+; SSE-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; SSE-NEXT:    [[R0212:%.*]] = insertelement <4 x double> [[TMP4]], double 0.000000e+00, i64 1
-; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R0212]], double [[R3]], i64 3
+; SSE-NEXT:    [[R021:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP4]], <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+; SSE-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R021]], double [[R3]], i32 3
 ; SSE-NEXT:    ret <4 x double> [[R03]]
 ;
 ; SLM-LABEL: @test_v4f64_partial_swizzle(
@@ -232,21 +232,21 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double poison, double poison>, double [[R0]], i64 0
+; SLM-NEXT:    [[R00:%.*]] = insertelement <4 x double> zeroinitializer, double [[R0]], i32 0
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; SLM-NEXT:    ret <4 x double> [[R031]]
 ;
 ; AVX-LABEL: @test_v4f64_partial_swizzle(
-; AVX-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; AVX-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; AVX-NEXT:    [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i32 2
+; AVX-NEXT:    [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
 ; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
 ; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; AVX-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 ; AVX-NEXT:    [[R3:%.*]] = fadd double [[B2]], [[B3]]
 ; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; AVX-NEXT:    [[R0212:%.*]] = insertelement <4 x double> [[TMP4]], double 0.000000e+00, i64 1
-; AVX-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R0212]], double [[R3]], i64 3
+; AVX-NEXT:    [[R021:%.*]] = shufflevector <4 x double> zeroinitializer, <4 x double> [[TMP4]], <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+; AVX-NEXT:    [[R03:%.*]] = insertelement <4 x double> [[R021]], double [[R3]], i32 3
 ; AVX-NEXT:    ret <4 x double> [[R03]]
 ;
  %a0 = extractelement <4 x double> %a, i64 0
@@ -268,21 +268,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -330,21 +330,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE-LABEL: @test_v4i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4i64(
@@ -376,21 +376,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @test_v8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8i32(
@@ -438,21 +438,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SLM-NEXT:    [[TMP5:%.*]] = add <8 x i16> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP7:%.*]] = add <16 x i16> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

 ;
 ; 128-bit vectors
@@ -148,21 +148,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
@@ -194,21 +194,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -256,21 +256,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE-LABEL: @test_v4i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4i64(
@@ -302,21 +302,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @test_v8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8i32(
@@ -364,21 +364,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

 ;
 ; 128-bit vectors
@@ -148,21 +148,21 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
 ; SSE-LABEL: @test_v4f64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4f64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = fsub <4 x double> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x double> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4f64(
@@ -194,21 +194,21 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
 ; SSE-LABEL: @test_v8f32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8f32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = fsub <4 x float> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x float> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8f32(
@@ -256,21 +256,21 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE-LABEL: @test_v4i64(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v4i64(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <4 x i64> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v4i64(
@@ -302,21 +302,21 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @test_v8i32(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v8i32(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <8 x i32> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v8i32(
@@ -364,21 +364,21 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: @test_v16i16(
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SSE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SSE-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
-; SSE-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
-; SSE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT:    [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]]
 ; SSE-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; SLM-LABEL: @test_v16i16(
 ; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
 ; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
 ; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
-; SLM-NEXT:    [[TMP5:%.*]] = sub <8 x i16> [[TMP1]], [[TMP3]]
-; SLM-NEXT:    [[TMP6:%.*]] = sub <8 x i16> [[TMP2]], [[TMP4]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT:    [[TMP7:%.*]] = sub <16 x i16> [[TMP5]], [[TMP6]]
 ; SLM-NEXT:    ret <16 x i16> [[TMP7]]
 ;
 ; AVX-LABEL: @test_v16i16(
--- a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+sse2     -S | FileCheck %s --check-prefix=SSE
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx      -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx2     -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512f  -S | FileCheck %s --check-prefix=AVX
-; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer,instcombine -mattr=+avx512vl -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+sse2     -S | FileCheck %s --check-prefix=SSE
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx      -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx2     -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx512f  -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -slp-threshold=-6 -passes=slp-vectorizer -mattr=+avx512vl -S | FileCheck %s --check-prefix=AVX

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -17,15 +17,15 @@ target triple = "x86_64-unknown-linux-gnu"
 define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ; SSE-LABEL: @PR31243_zext(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
 ; SSE-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1)
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; SSE-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
-; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; SSE-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 [[TMP6]]
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
+; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]]
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]]
 ; SSE-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; SSE-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; SSE-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -33,15 +33,15 @@ define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ;
 ; AVX-LABEL: @PR31243_zext(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
 ; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1)
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; AVX-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
-; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; AVX-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 [[TMP6]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; AVX-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
+; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]]
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
+; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]]
 ; AVX-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; AVX-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; AVX-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -76,15 +76,15 @@ entry:
 define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ; SSE-LABEL: @PR31243_sext(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; SSE-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0
+; SSE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
 ; SSE-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1)
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; SSE-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
-; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; SSE-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; SSE-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i32
+; SSE-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]]
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; SSE-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i32
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]]
 ; SSE-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; SSE-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; SSE-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
@@ -92,15 +92,15 @@ define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, ptr %ptr) {
 ;
 ; AVX-LABEL: @PR31243_sext(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i64 0
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i64 1
+; AVX-NEXT:    [[TMP0:%.*]] = insertelement <2 x i8> poison, i8 [[V0:%.*]], i32 0
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
 ; AVX-NEXT:    [[TMP2:%.*]] = or <2 x i8> [[TMP1]], splat (i8 1)
-; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i64 0
-; AVX-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i64
-; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 [[TMP4]]
-; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i64 1
-; AVX-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i64
-; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 [[TMP6]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; AVX-NEXT:    [[TMP4:%.*]] = sext i8 [[TMP3]] to i32
+; AVX-NEXT:    [[T4:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i32 [[TMP4]]
+; AVX-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; AVX-NEXT:    [[TMP6:%.*]] = sext i8 [[TMP5]] to i32
+; AVX-NEXT:    [[T5:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 [[TMP6]]
 ; AVX-NEXT:    [[T6:%.*]] = load i8, ptr [[T4]], align 1
 ; AVX-NEXT:    [[T7:%.*]] = load i8, ptr [[T5]], align 1
 ; AVX-NEXT:    [[T8:%.*]] = add i8 [[T6]], [[T7]]
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: opt < %s -passes=slp-vectorizer,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2

 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"

@@ -10,16 +10,16 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 define void @shuffle_operands1(ptr noalias %from, ptr noalias %to, double %v1, double %v2) {
 ; CHECK-LABEL: @shuffle_operands1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ; SSE2-LABEL: @shuffle_operands1(
 ; SSE2-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
-; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i32 0
+; SSE2-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i32 1
 ; SSE2-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 ; SSE2-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    ret void
@@ -43,7 +43,7 @@ define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -57,7 +57,7 @@ define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -91,7 +91,7 @@ define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -105,7 +105,7 @@ define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -139,7 +139,7 @@ define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -153,7 +153,7 @@ define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i32 0
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -185,12 +185,12 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
+; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, ptr [[FROM:%.*]], i64 1
 ; CHECK-NEXT:    [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
 ; CHECK-NEXT:    [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
@@ -205,7 +205,7 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -239,7 +239,7 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -253,7 +253,7 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -286,13 +286,13 @@ define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1
 ; CHECK-NEXT:    br label [[LP:%.*]]
 ; CHECK:       lp:
 ; CHECK-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
+; CHECK-NEXT:    [[FROM_1:%.*]] = getelementptr double, ptr [[FROM:%.*]], i64 1
 ; CHECK-NEXT:    [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
 ; CHECK-NEXT:    [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -306,7 +306,7 @@ define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1
 ; SSE2-NEXT:    [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
 ; SSE2-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
 ; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
+; SSE2-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
 ; SSE2-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
 ; SSE2-NEXT:    store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
 ; SSE2-NEXT:    br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
@@ -347,23 +347,20 @@ define void @good_load_order() {
 ; CHECK:       for.body3:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ]
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
-; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP2]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP12]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]]
+; CHECK-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT:    [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4
 ; CHECK-NEXT:    [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]]
 ; CHECK-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
@@ -382,23 +379,20 @@ define void @good_load_order() {
 ; SSE2:       for.body3:
 ; SSE2-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP13:%.*]], [[FOR_BODY3]] ]
 ; SSE2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
-; SSE2-NEXT:    [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; SSE2-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
-; SSE2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
-; SSE2-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; SSE2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
-; SSE2-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; SSE2-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], 4
-; SSE2-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
+; SSE2-NEXT:    [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 1
+; SSE2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP2]]
+; SSE2-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
+; SSE2-NEXT:    [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], 4
+; SSE2-NEXT:    [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP3]]
 ; SSE2-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
 ; SSE2-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
 ; SSE2-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
-; SSE2-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i64 0
+; SSE2-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
+; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP12]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
 ; SSE2-NEXT:    [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
 ; SSE2-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4
 ; SSE2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
-; SSE2-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; SSE2-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP12]]
+; SSE2-NEXT:    [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]]
 ; SSE2-NEXT:    [[TMP13]] = load float, ptr [[ARRAYIDX41]], align 4
 ; SSE2-NEXT:    [[MUL45:%.*]] = fmul float [[TMP13]], [[TMP7]]
 ; SSE2-NEXT:    store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr40522.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-- -mcpu=corei7 < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-- -mcpu=corei7 < %s | FileCheck %s

 define void @test1(float %a, float %b, float %c, float %d, ptr nocapture %p) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32>
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT:    ret void
@@ -30,10 +30,10 @@ entry:
 define void @test1_vec(float %a, float %b, float %c, float %d, ptr nocapture %p) {
 ; CHECK-LABEL: @test1_vec(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[C:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[D:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32>
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 16, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
@@ -54,10 +54,10 @@ entry:
 define void @test2(i32 %a, i32 %b, i32 %c, i32 %d, ptr nocapture %p) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[D:%.*]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[D:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 1)
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[P:%.*]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
@@ -79,10 +79,10 @@ entry:

 define void @test2_vec(i32 %0, i32 %1, i32 %2, i32 %3, ptr nocapture %4) {
 ; CHECK-LABEL: @test2_vec(
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i64 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1:%.*]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i64 2
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], splat (i32 1)
 ; CHECK-NEXT:    store <4 x i32> [[TMP10]], ptr [[TMP4:%.*]], align 16, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -1,19 +1,20 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2   -slp-threshold=-1 | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx    | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2   | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2   -slp-threshold=-1 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx    | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2   | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s

 define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i32(
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP4]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], splat (i32 15)
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP8]], <4 x i32> splat (i32 255))
-; CHECK-NEXT:    store <4 x i32> [[TMP9]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult <4 x i32> [[TMP8]], splat (i32 255)
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP8]], <4 x i32> splat (i32 255)
+; CHECK-NEXT:    store <4 x i32> [[TMP10]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    ret void
 ;
  %4 = load i32, ptr %0, align 4, !tbaa !2
@@ -50,13 +51,14 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i8(
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i8>, ptr [[TMP0:%.*]], align 1, !tbaa [[TBAA4:![0-9]+]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i32> [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr <4 x i32> [[TMP8]], splat (i32 15)
-; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP9]], <4 x i32> splat (i32 255))
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc nuw <4 x i32> [[TMP10]] to <4 x i8>
-; CHECK-NEXT:    store <4 x i8> [[TMP11]], ptr [[TMP0]], align 1, !tbaa [[TBAA4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult <4 x i32> [[TMP9]], splat (i32 255)
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP9]], <4 x i32> splat (i32 255)
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc <4 x i32> [[TMP11]] to <4 x i8>
+; CHECK-NEXT:    store <4 x i8> [[TMP12]], ptr [[TMP0]], align 1, !tbaa [[TBAA4]]
 ; CHECK-NEXT:    ret void
 ;
  %4 = load i8, ptr %0, align 1, !tbaa !6
@@ -101,7 +103,7 @@ define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i64(
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15)
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512


@b = global [8 x i32] zeroinitializer, align 16
@@ -13,14 +13,14 @@ define void @foo() {
 ; SSE-LABEL: @foo(
 ; SSE-NEXT:    [[TMP1:%.*]] = load i32, ptr @b, align 16
 ; SSE-NEXT:    store i32 [[TMP1]], ptr @a, align 16
-; SSE-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 4), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 8), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 12), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 16), align 16
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 20), align 4
-; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 24), align 8
-; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 28), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @b, i64 0, i64 2), align 8
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 1), align 4
+; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 2), align 8
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 3), align 4
+; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 4), align 16
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 5), align 4
+; SSE-NEXT:    store i32 [[TMP1]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 6), align 8
+; SSE-NEXT:    store i32 [[TMP2]], ptr getelementptr inbounds ([8 x i32], ptr @a, i64 0, i64 7), align 4
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @foo(
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -1,89 +1,104 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512F
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512F
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL

 define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; SSE-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; SSE-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; SSE-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; SSE-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; SSE-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; SSE-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load(
-; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX512F-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX512F-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512F-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX512F-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX512VL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX512VL-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX512VL-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512VL-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX512VL-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512VL-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512VL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512VL-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX512VL-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX512VL-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512VL-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX512VL-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
 ;
  %3 = getelementptr inbounds i32, ptr %1, i64 1
@@ -107,64 +122,64 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl

 define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_2(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
 ; SSE-NEXT:    store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; SSE-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
 ; SSE-NEXT:    store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; SSE-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
 ; SSE-NEXT:    store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; SSE-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
 ; SSE-NEXT:    store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_2(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; AVX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
 ; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_2(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
 ; AVX2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX2-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_2(
-; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512F-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX512F-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
 ; AVX512F-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
 ; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
@@ -172,8 +187,8 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_2(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512VL-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
 ; AVX512VL-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
@@ -207,39 +222,39 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; SSE-LABEL: @gather_load_3(
 ; SSE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
 ; SSE-NEXT:    store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
 ; SSE-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
 ; SSE-NEXT:    store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
 ; SSE-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
+; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
 ; SSE-NEXT:    store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
 ; SSE-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4
 ; SSE-NEXT:    store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72
+; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
 ; SSE-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 20
+; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 5
 ; SSE-NEXT:    store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36
+; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
 ; SSE-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 24
+; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 6
 ; SSE-NEXT:    store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24
+; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
 ; SSE-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 28
+; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 7
 ; SSE-NEXT:    store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84
+; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
 ; SSE-NEXT:    [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
 ; SSE-NEXT:    store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[TBAA0]]
@@ -247,28 +262,28 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 ;
 ; AVX-LABEL: @gather_load_3(
 ; AVX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
 ; AVX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
 ; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
 ; AVX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
 ; AVX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
 ; AVX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
 ; AVX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
 ; AVX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
-; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
-; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
-; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
-; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
-; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
-; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
-; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2
+; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3
+; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4
+; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5
+; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6
+; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7
 ; AVX-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
@@ -340,20 +355,20 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado

 define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) {
 ; SSE-LABEL: @gather_load_4(
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[T0:%.*]], i64 4
-; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44
-; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 8
-; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16
-; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 12
-; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60
-; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 16
-; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72
-; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 20
-; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36
-; SSE-NEXT:    [[T25:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 24
-; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24
-; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 28
-; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0:%.*]], i64 1
+; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11
+; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 2
+; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4
+; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 3
+; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15
+; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 4
+; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18
+; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 5
+; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9
+; SSE-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 6
+; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6
+; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 7
+; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21
 ; SSE-NEXT:    [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
@@ -381,13 +396,13 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_4(
-; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44
-; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16
-; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60
-; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72
-; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36
-; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24
-; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84
+; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11
+; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4
+; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15
+; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18
+; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9
+; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6
+; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21
 ; AVX-NEXT:    [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
@@ -396,14 +411,14 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 ; AVX-NEXT:    [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7
 ; AVX-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
@@ -481,153 +496,154 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
 ; SSE-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; SSE-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 16
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4
 ; SSE-NEXT:    [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
 ; SSE-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; SSE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
 ; SSE-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
-; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i32 3
 ; SSE-NEXT:    [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]]
 ; SSE-NEXT:    store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; SSE-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; SSE-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; SSE-NEXT:    [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; SSE-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; SSE-NEXT:    [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; SSE-NEXT:    [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; SSE-NEXT:    [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; SSE-NEXT:    [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0
-; SSE-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1
-; SSE-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2
-; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3
-; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0
-; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1
-; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2
-; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3
+; SSE-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i32 0
+; SSE-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i32 1
+; SSE-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i32 2
+; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i32 3
+; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i32 0
+; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i32 1
+; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i32 2
+; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i32 3
 ; SSE-NEXT:    [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]]
 ; SSE-NEXT:    store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
 ; AVX-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; AVX-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; AVX-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; AVX-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; AVX-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; AVX-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
 ; AVX-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
-; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
-; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
-; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
+; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
+; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
+; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
 ; AVX-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
-; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
-; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
-; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
-; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
+; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
+; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
+; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7
 ; AVX-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
 ; AVX-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
 ; AVX2-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; AVX2-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX2-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; AVX2-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; AVX2-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX2-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; AVX2-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX2-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; AVX2-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX2-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
 ; AVX2-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
-; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
-; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
-; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
+; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
+; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
+; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
 ; AVX2-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
-; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
-; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
-; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
-; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
+; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
+; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
+; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7
 ; AVX2-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
 ; AVX2-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_div(
 ; AVX512F-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512F-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
@@ -637,6 +653,7 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ;
 ; AVX512VL-LABEL: @gather_load_div(
 ; AVX512VL-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512VL-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512VL-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -1,89 +1,104 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512F
-; RUN:  opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse2     | FileCheck %s --check-prefixes=SSE
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx      | FileCheck %s --check-prefixes=AVX
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx2     | FileCheck %s --check-prefixes=AVX2
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512F
+; RUN:  opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512VL

 define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; SSE-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; SSE-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; SSE-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; SSE-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; SSE-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; SSE-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; SSE-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; SSE-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; SSE-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; SSE-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; SSE-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX2-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX2-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX2-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load(
-; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX512F-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512F-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX512F-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX512F-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX512F-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512F-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512F-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX512F-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512F-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512F-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX512VL-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
-; AVX512VL-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
-; AVX512VL-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX512VL-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
-; AVX512VL-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
-; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
+; AVX512VL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512VL-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512VL-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512VL-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; AVX512VL-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX512VL-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512VL-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512VL-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
 ; AVX512VL-NEXT:    [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
-; AVX512VL-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    ret void
 ;
  %3 = getelementptr inbounds i32, ptr %1, i64 1
@@ -107,64 +122,64 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl

 define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_2(
-; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; SSE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; SSE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
 ; SSE-NEXT:    store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; SSE-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
 ; SSE-NEXT:    store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; SSE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; SSE-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
 ; SSE-NEXT:    store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; SSE-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
 ; SSE-NEXT:    store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_2(
-; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; AVX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; AVX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; AVX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
 ; AVX-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_2(
-; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
+; AVX2-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
 ; AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
 ; AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
 ; AVX2-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
 ; AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
-; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
-; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2
-; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
+; AVX2-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX2-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i32 1
+; AVX2-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i32 2
+; AVX2-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 3
 ; AVX2-NEXT:    [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
 ; AVX2-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_2(
-; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512F-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX512F-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
 ; AVX512F-NEXT:    [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
 ; AVX512F-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
 ; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
@@ -172,8 +187,8 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; AVX512F-NEXT:    ret void
 ;
 ; AVX512VL-LABEL: @gather_load_2(
-; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4
-; AVX512VL-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX512VL-NEXT:    [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr [[TMP3]], i32 4, <10 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <10 x i32> poison), !tbaa [[TBAA0]]
 ; AVX512VL-NEXT:    [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 9>
 ; AVX512VL-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 3, i32 4, i32 2>
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
@@ -207,39 +222,39 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; SSE-LABEL: @gather_load_3(
 ; SSE-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 4
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
 ; SSE-NEXT:    store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
 ; SSE-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
 ; SSE-NEXT:    store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
+; SSE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
 ; SSE-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 12
+; SSE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
 ; SSE-NEXT:    store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60
+; SSE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
 ; SSE-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
+; SSE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4
 ; SSE-NEXT:    store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72
+; SSE-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
 ; SSE-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 20
+; SSE-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 5
 ; SSE-NEXT:    store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36
+; SSE-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
 ; SSE-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 24
+; SSE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 6
 ; SSE-NEXT:    store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24
+; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
 ; SSE-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 28
+; SSE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 7
 ; SSE-NEXT:    store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84
+; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
 ; SSE-NEXT:    [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP32:%.*]] = add i32 [[TMP31]], 4
 ; SSE-NEXT:    store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[TBAA0]]
@@ -247,28 +262,28 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 ;
 ; AVX-LABEL: @gather_load_3(
 ; AVX-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
 ; AVX-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
 ; AVX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60
+; AVX-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
 ; AVX-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72
+; AVX-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
 ; AVX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36
+; AVX-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
 ; AVX-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24
+; AVX-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
 ; AVX-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84
+; AVX-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
 ; AVX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
-; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
-; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2
-; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3
-; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4
-; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5
-; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6
-; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
+; AVX-NEXT:    [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i32 1
+; AVX-NEXT:    [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i32 2
+; AVX-NEXT:    [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i32 3
+; AVX-NEXT:    [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i32 4
+; AVX-NEXT:    [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i32 5
+; AVX-NEXT:    [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i32 6
+; AVX-NEXT:    [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i32 7
 ; AVX-NEXT:    [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
@@ -340,20 +355,20 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado

 define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) {
 ; SSE-LABEL: @gather_load_4(
-; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds nuw i8, ptr [[T0:%.*]], i64 4
-; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44
-; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 8
-; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16
-; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 12
-; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60
-; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 16
-; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72
-; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 20
-; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36
-; SSE-NEXT:    [[T25:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 24
-; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24
-; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds nuw i8, ptr [[T0]], i64 28
-; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84
+; SSE-NEXT:    [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0:%.*]], i64 1
+; SSE-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11
+; SSE-NEXT:    [[T9:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 2
+; SSE-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4
+; SSE-NEXT:    [[T13:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 3
+; SSE-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15
+; SSE-NEXT:    [[T17:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 4
+; SSE-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18
+; SSE-NEXT:    [[T21:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 5
+; SSE-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9
+; SSE-NEXT:    [[T25:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 6
+; SSE-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6
+; SSE-NEXT:    [[T29:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 7
+; SSE-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21
 ; SSE-NEXT:    [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
@@ -381,13 +396,13 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_4(
-; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44
-; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16
-; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60
-; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72
-; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36
-; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24
-; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84
+; AVX-NEXT:    [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11
+; AVX-NEXT:    [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4
+; AVX-NEXT:    [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15
+; AVX-NEXT:    [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18
+; AVX-NEXT:    [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9
+; AVX-NEXT:    [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6
+; AVX-NEXT:    [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21
 ; AVX-NEXT:    [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
@@ -396,14 +411,14 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 ; AVX-NEXT:    [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1
-; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7
+; AVX-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i32 0
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i32 1
+; AVX-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i32 2
+; AVX-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i32 3
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i32 4
+; AVX-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i32 5
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i32 6
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i32 7
 ; AVX-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
 ; AVX-NEXT:    store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
@@ -481,153 +496,154 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
 ; SSE-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; SSE-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; SSE-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; SSE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; SSE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; SSE-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0:%.*]], i64 16
+; SSE-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4
 ; SSE-NEXT:    [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
+; SSE-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
 ; SSE-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
-; SSE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
 ; SSE-NEXT:    [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
-; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3
+; SSE-NEXT:    [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i32 3
 ; SSE-NEXT:    [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]]
 ; SSE-NEXT:    store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; SSE-NEXT:    [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; SSE-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; SSE-NEXT:    [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; SSE-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; SSE-NEXT:    [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; SSE-NEXT:    [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; SSE-NEXT:    [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; SSE-NEXT:    [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; SSE-NEXT:    [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; SSE-NEXT:    [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; SSE-NEXT:    [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; SSE-NEXT:    [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; SSE-NEXT:    [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; SSE-NEXT:    [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; SSE-NEXT:    [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; SSE-NEXT:    [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0
-; SSE-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1
-; SSE-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2
-; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3
-; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0
-; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1
-; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2
-; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3
+; SSE-NEXT:    [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i32 0
+; SSE-NEXT:    [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i32 1
+; SSE-NEXT:    [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i32 2
+; SSE-NEXT:    [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i32 3
+; SSE-NEXT:    [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i32 0
+; SSE-NEXT:    [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i32 1
+; SSE-NEXT:    [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i32 2
+; SSE-NEXT:    [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i32 3
 ; SSE-NEXT:    [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]]
 ; SSE-NEXT:    store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @gather_load_div(
 ; AVX-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; AVX-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; AVX-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; AVX-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; AVX-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; AVX-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; AVX-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; AVX-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; AVX-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; AVX-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; AVX-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; AVX-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; AVX-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; AVX-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
 ; AVX-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
-; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
-; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
-; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
+; AVX-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
+; AVX-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
+; AVX-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
 ; AVX-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
-; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
-; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
-; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
-; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
+; AVX-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
+; AVX-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
+; AVX-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7
 ; AVX-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
 ; AVX-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT:    ret void
 ;
 ; AVX2-LABEL: @gather_load_div(
 ; AVX2-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40
-; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 52
-; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12
-; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 176
+; AVX2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; AVX2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; AVX2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; AVX2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
 ; AVX2-NEXT:    [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 68
+; AVX2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
 ; AVX2-NEXT:    [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 132
+; AVX2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
 ; AVX2-NEXT:    [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 32
+; AVX2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
 ; AVX2-NEXT:    [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 120
+; AVX2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
 ; AVX2-NEXT:    [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20
+; AVX2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
 ; AVX2-NEXT:    [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 108
+; AVX2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
 ; AVX2-NEXT:    [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 80
+; AVX2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
 ; AVX2-NEXT:    [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 92
+; AVX2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
 ; AVX2-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0
+; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
 ; AVX2-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> <i32 0, i32 1, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> <i32 0, i32 1, i32 2, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4
-; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5
-; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6
-; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7
+; AVX2-NEXT:    [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i32 4
+; AVX2-NEXT:    [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i32 5
+; AVX2-NEXT:    [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i32 6
+; AVX2-NEXT:    [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i32 7
 ; AVX2-NEXT:    [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> <i32 0, i32 1, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3
-; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4
-; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5
-; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6
-; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7
+; AVX2-NEXT:    [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i32 3
+; AVX2-NEXT:    [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i32 4
+; AVX2-NEXT:    [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i32 5
+; AVX2-NEXT:    [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i32 6
+; AVX2-NEXT:    [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i32 7
 ; AVX2-NEXT:    [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]]
 ; AVX2-NEXT:    store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT:    ret void
 ;
 ; AVX512F-LABEL: @gather_load_div(
 ; AVX512F-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512F-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512F-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512F-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512F-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
@@ -637,6 +653,7 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ;
 ; AVX512VL-LABEL: @gather_load_div(
 ; AVX512VL-NEXT:    [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <45 x float> poison), !tbaa [[TBAA0]]
+; AVX512VL-NEXT:    [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> <i32 0, i32 3, i32 4, i32 5, i32 8, i32 10, i32 11, i32 13, i32 14, i32 17, i32 20, i32 23, i32 27, i32 30, i32 33, i32 44>
 ; AVX512VL-NEXT:    [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 0, i32 3, i32 5, i32 8, i32 10, i32 14, i32 17, i32 20>
 ; AVX512VL-NEXT:    [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> <i32 4, i32 11, i32 27, i32 30, i32 13, i32 44, i32 33, i32 23>
 ; AVX512VL-NEXT:    [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]]
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll
@@ -1,15 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=slp-vectorizer,instcombine -S < %s | FileCheck %s
+; RUN: opt -passes=slp-vectorizer -S < %s | FileCheck %s
 ; These code should be fully vectorized by D57059 patch

 target triple = "x86_64-unknown-linux-gnu"

 define <4 x i32> @foo(<4 x i32> %x, i32 %f) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i64 0
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i32 0
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[F]], 1
-; CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i64 0
+; CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -28,7 +28,7 @@ define <4 x i32> @foo(<4 x i32> %x, i32 %f) {

 define <4 x i32> @bar(<4 x i32> %x, i32 %f) {
 ; CHECK-LABEL: @bar(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], <i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll
@@ -1,12 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64-- -passes=slp-vectorizer,instcombine -S < %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-- -passes=slp-vectorizer -S < %s | FileCheck %s

 ; These conversions should be vectorized by reviews.llvm.org/D57059

 define dso_local <4 x float> @foo(<4 x i32> %0) {
 ; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP0:%.*]] to <4 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP0]] to <4 x float>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
 ; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 ;
  %2 = extractelement <4 x i32> %0, i32 1
--- a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer,instcombine -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s

 define float @dotf(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: @dotf(
--- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s
-; RUN: opt -passes=slp-vectorizer,instcombine -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s --check-prefix COMBINE
+; RUN: opt -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s --check-prefix COMBINE

 define void @test1(ptr %in, ptr %out) {
 ; CHECK-LABEL: @test1(
@@ -19,8 +19,14 @@ define void @test1(ptr %in, ptr %out) {
 ; COMBINE-LABEL: @test1(
 ; COMBINE-NEXT:  entry:
 ; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1
-; COMBINE-NEXT:    [[TMP1:%.*]] = zext <8 x i32> [[TMP0]] to <8 x i64>
-; COMBINE-NEXT:    store <8 x i64> [[TMP1]], ptr [[OUT:%.*]], align 8
+; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0
+; COMBINE-NEXT:    [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0)
+; COMBINE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
+; COMBINE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    store <8 x i64> [[TMP1]], ptr [[OUT]], align 8
 ; COMBINE-NEXT:    ret void
 ;
 entry:
@@ -61,9 +67,14 @@ define void @test2(ptr %in, ptr %out) {
 ; COMBINE-LABEL: @test2(
 ; COMBINE-NEXT:  entry:
 ; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1
-; COMBINE-NEXT:    [[TMP1:%.*]] = zext <8 x i32> [[TMP0]] to <8 x i64>
+; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0
+; COMBINE-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0)
+; COMBINE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP1:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
+; COMBINE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
-; COMBINE-NEXT:    store <8 x i64> [[TMP2]], ptr [[OUT:%.*]], align 8
+; COMBINE-NEXT:    store <8 x i64> [[TMP2]], ptr [[OUT]], align 8
 ; COMBINE-NEXT:    ret void
 ;
 entry:
@@ -99,8 +110,11 @@ define void @test3(<16 x i32> %0, ptr %out) {
 ;
 ; COMBINE-LABEL: @test3(
 ; COMBINE-NEXT:  entry:
-; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
-; COMBINE-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; COMBINE-NEXT:    [[TMP3:%.*]] = call <64 x i32> @llvm.vector.insert.v64i32.v16i32(<64 x i32> poison, <16 x i32> [[TMP0:%.*]], i64 0)
+; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0
+; COMBINE-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT]], align 4
 ; COMBINE-NEXT:    ret void
 ;
 entry:
@@ -134,8 +148,12 @@ define void @test4(ptr %in, ptr %out) {
 ; COMBINE-LABEL: @test4(
 ; COMBINE-NEXT:  entry:
 ; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 4
-; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; COMBINE-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; COMBINE-NEXT:    [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0
+; COMBINE-NEXT:    [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0)
+; COMBINE-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; COMBINE-NEXT:    store <16 x i32> [[TMP1]], ptr [[OUT]], align 4
 ; COMBINE-NEXT:    ret void
 ;
 entry:
@@ -165,7 +183,11 @@ define void @test5(ptr %out) {
 ;
 ; COMBINE-LABEL: @test5(
 ; COMBINE-NEXT:  entry:
-; COMBINE-NEXT:    store <8 x i32> zeroinitializer, ptr [[OUT:%.*]], align 4
+; COMBINE-NEXT:    [[TMP0:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> zeroinitializer, i64 0)
+; COMBINE-NEXT:    [[TMP1:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP0]], <8 x i32> zeroinitializer, i64 8)
+; COMBINE-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 0
+; COMBINE-NEXT:    store <8 x i32> [[TMP2]], ptr [[TMP3]], align 4
 ; COMBINE-NEXT:    ret void
 ;
 entry:
@@ -219,28 +241,37 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) {
 ;
 ; COMBINE-LABEL: @test6(
 ; COMBINE-NEXT:  entry:
-; COMBINE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr [[IN0:%.*]], i64 32
+; COMBINE-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[IN0:%.*]], i64 32
 ; COMBINE-NEXT:    [[LOAD2:%.*]] = load <4 x float>, ptr [[GEP1]], align 16
 ; COMBINE-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[IN0]], align 16
 ; COMBINE-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr [[IN1:%.*]], align 1
-; COMBINE-NEXT:    [[TMP2:%.*]] = uitofp <32 x i8> [[TMP1]] to <32 x float>
+; COMBINE-NEXT:    [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP11:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP4:%.*]] = zext <32 x i8> [[TMP11]] to <32 x i16>
+; COMBINE-NEXT:    [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP19:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP2:%.*]] = uitofp <32 x i16> [[TMP19]] to <32 x float>
 ; COMBINE-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; COMBINE-NEXT:    [[TMP5:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP3]], <4 x float> [[LOAD2]], i64 8)
 ; COMBINE-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; COMBINE-NEXT:    [[TMP7:%.*]] = fmul <32 x float> [[TMP6]], [[TMP2]]
-; COMBINE-NEXT:    [[GEP10:%.*]] = getelementptr inbounds nuw i8, ptr [[IN1]], i64 32
-; COMBINE-NEXT:    [[GEP11:%.*]] = getelementptr inbounds nuw i8, ptr [[IN2:%.*]], i64 128
+; COMBINE-NEXT:    [[GEP10:%.*]] = getelementptr inbounds i8, ptr [[IN1]], i64 32
+; COMBINE-NEXT:    [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2:%.*]], i64 128
 ; COMBINE-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[IN0]], align 16
 ; COMBINE-NEXT:    store <32 x float> [[TMP7]], ptr [[IN2]], align 16
 ; COMBINE-NEXT:    [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1
-; COMBINE-NEXT:    [[TMP9:%.*]] = uitofp <16 x i8> [[LOAD5]] to <16 x float>
-; COMBINE-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP13:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; COMBINE-NEXT:    [[TMP14:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; COMBINE-NEXT:    [[TMP15:%.*]] = shufflevector <16 x float> [[TMP12]], <16 x float> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
+; COMBINE-NEXT:    [[TMP13:%.*]] = call <32 x i8> @llvm.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> [[LOAD5]], i64 0)
+; COMBINE-NEXT:    [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16>
+; COMBINE-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; COMBINE-NEXT:    [[TMP9:%.*]] = uitofp <16 x i16> [[TMP18]] to <16 x float>
+; COMBINE-NEXT:    [[TMP20:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> [[LOAD2]], i64 0)
+; COMBINE-NEXT:    [[TMP21:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP8]], i64 0)
+; COMBINE-NEXT:    [[TMP22:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP20]], <4 x float> [[TMP21]], i64 4)
+; COMBINE-NEXT:    [[TMP23:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP8]], i64 4)
+; COMBINE-NEXT:    [[TMP15:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP22]], <4 x float> [[TMP23]], i64 8)
 ; COMBINE-NEXT:    [[TMP16:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
 ; COMBINE-NEXT:    [[TMP17:%.*]] = fmul <16 x float> [[TMP16]], [[TMP9]]
 ; COMBINE-NEXT:    store <16 x float> [[TMP17]], ptr [[GEP11]], align 16
@@ -353,7 +384,21 @@ define i32 @test7() {
 ;
 ; COMBINE-LABEL: @test7(
 ; COMBINE-NEXT:  entry:
-; COMBINE-NEXT:    store <16 x float> poison, ptr null, align 16
+; COMBINE-NEXT:    [[TMP0:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> zeroinitializer, i64 0)
+; COMBINE-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP0]], <8 x float> zeroinitializer, i64 8)
+; COMBINE-NEXT:    [[TMP2:%.*]] = fsub <16 x float> [[TMP1]], [[TMP1]]
+; COMBINE-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[TMP1]], [[TMP1]]
+; COMBINE-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP5:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 0)
+; COMBINE-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> zeroinitializer, i64 4)
+; COMBINE-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP7]], <4 x float> zeroinitializer, i64 8)
+; COMBINE-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP8]], <4 x float> zeroinitializer, i64 12)
+; COMBINE-NEXT:    [[TMP10:%.*]] = fadd <16 x float> [[TMP9]], [[TMP5]]
+; COMBINE-NEXT:    [[TMP11:%.*]] = fsub <16 x float> [[TMP9]], [[TMP5]]
+; COMBINE-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; COMBINE-NEXT:    [[TMP13:%.*]] = fadd <16 x float> [[TMP9]], [[TMP12]]
+; COMBINE-NEXT:    store <16 x float> [[TMP13]], ptr null, align 16
 ; COMBINE-NEXT:    ret i32 0
 ;
 entry: