Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
Nikita Popov 90ba33099c [InstCombine] Canonicalize constant GEPs to i8 source element type (#68882)
This patch canonicalizes getelementptr instructions with constant
indices to use the `i8` source element type. This makes it easier for
optimizations to recognize that two GEPs are identical, because they
don't need to see past many different ways to express the same offset.

This is a first step towards
https://discourse.llvm.org/t/rfc-replacing-getelementptr-with-ptradd/68699.
This is limited to constant GEPs only for now, as they have a clear
canonical form, while we're not yet sure how exactly to deal with
variable indices.

The test llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll gives
two representative examples of the kind of optimization improvement we
expect from this change. In the first test SimplifyCFG can now realize
that all switch branches are actually the same. In the second test it
can convert it into simple arithmetic. These are representative of
common optimization failures we see in Rust.

Fixes https://github.com/llvm/llvm-project/issues/69841.
2024-01-24 15:25:29 +01:00

600 lines
26 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; Both invocations run the SLP vectorizer followed by instcombine and dce.
; -slp-threshold=-100 lowers the profitability threshold so that trees are
; vectorized even when the cost model reports little or no gain.
; The first invocation targets corei7-avx and is verified with the default
; FileCheck prefix; the second enables only +sse2 and is verified with its
; own prefix, so the two configurations may expect different output.
; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
; RUN: opt < %s -passes=slp-vectorizer,instcombine,dce -slp-threshold=-100 -S -mtriple=i386-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; Layout notes: pointers are 32 bits wide (p:32:32:32) and i64/f64 have a
; 32-bit ABI alignment (i64:32:64, f64:32:64). That is why loads and stores
; written below without an explicit alignment show up as "align 4" in the
; expected output, and why the expected GEP indices are i32 although the
; source uses i64 indices.
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
; Make sure we order the operands of commutative operations so that we get
; bigger vectorizable trees.
; Lane 0 has the loaded value on the left (%v0_1 + %v1) while lane 1 has it on
; the right (%v2 + %v0_2). Both runs expect the two adjacent loads to become a
; single <2 x double> load, %v1/%v2 to be packed with insertelement, and one
; vector fadd feeding one vector store.
define void @shuffle_operands1(ptr noalias %from, ptr noalias %to, double %v1, double %v2) {
; CHECK-LABEL: @shuffle_operands1(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @shuffle_operands1(
; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0
; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1
; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: ret void
;
%from_1 = getelementptr double, ptr %from, i64 1
%v0_1 = load double , ptr %from
%v0_2 = load double , ptr %from_1
; Lane 0: load is the left operand.
%v1_1 = fadd double %v0_1, %v1
; Lane 1: load is the right operand, so this lane has to be commuted.
%v1_2 = fadd double %v2, %v0_2
%to_2 = getelementptr double, ptr %to, i64 1
store double %v1_1, ptr %to
store double %v1_2, ptr %to_2
ret void
}
; %v0_1 is used by both lanes (a broadcast candidate) and %v0_1/%v0_2 are
; adjacent loads (a vector-load candidate). Both runs expect the
; <2 x double> load as the left fadd operand and <%p, from[0]> (shufflevector
; plus insertelement) as the right operand, i.e. lane 1 is commuted to
; %v0_2 + %v0_1. Parameters %v1 and %v2 are not used by the body.
define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @vecload_vs_broadcast(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @vecload_vs_broadcast(
; SSE2-NEXT: entry:
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]]
; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; SSE2-NEXT: ret void
;
entry:
br label %lp
lp:
; %p is 0.0 on entry and 1.0 on every back edge; it is the non-load operand.
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double, ptr %from, i64 1
%v0_1 = load double , ptr %from
%v0_2 = load double , ptr %from_1
; Lane 0: from[0] + %p.
%v1_1 = fadd double %v0_1, %p
; Lane 1: from[0] + from[1].
%v1_2 = fadd double %v0_1, %v0_2
%to_2 = getelementptr double, ptr %to, i64 1
store double %v1_1, ptr %to
store double %v1_2, ptr %to_2
br i1 %c, label %lp, label %ext
ext:
ret void
}
; Variant of the vector-load-versus-broadcast case with the shared load
; %v0_1 on the right of both scalar adds. Both runs expect
; fadd <%p, from[0]>, <from[0], from[1]>, where the right operand is the
; <2 x double> load; lane 1 is therefore commuted to %v0_1 + %v0_2.
; Parameters %v1 and %v2 are not used by the body.
define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @vecload_vs_broadcast2(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @vecload_vs_broadcast2(
; SSE2-NEXT: entry:
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; SSE2-NEXT: ret void
;
entry:
br label %lp
lp:
; %p is 0.0 on entry and 1.0 on every back edge; it is the non-load operand.
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double, ptr %from, i64 1
%v0_1 = load double , ptr %from
%v0_2 = load double , ptr %from_1
; Lane 0: %p + from[0].
%v1_1 = fadd double %p, %v0_1
; Lane 1: from[1] + from[0].
%v1_2 = fadd double %v0_2, %v0_1
%to_2 = getelementptr double, ptr %to, i64 1
store double %v1_1, ptr %to
store double %v1_2, ptr %to_2
br i1 %c, label %lp, label %ext
ext:
ret void
}
; Variant where the source operand order already lines up with the
; vector-load form: lane 0 is %p + from[0] and lane 1 is from[0] + from[1].
; Both runs expect fadd <%p, from[0]>, <from[0], from[1]> with the
; <2 x double> load as the right operand, so neither lane is commuted.
; Parameters %v1 and %v2 are not used by the body.
define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @vecload_vs_broadcast3(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @vecload_vs_broadcast3(
; SSE2-NEXT: entry:
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0
; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]]
; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; SSE2-NEXT: ret void
;
entry:
br label %lp
lp:
; %p is 0.0 on entry and 1.0 on every back edge; it is the non-load operand.
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double, ptr %from, i64 1
%v0_1 = load double , ptr %from
%v0_2 = load double , ptr %from_1
; Lane 0: %p + from[0].
%v1_1 = fadd double %p, %v0_1
; Lane 1: from[0] + from[1].
%v1_2 = fadd double %v0_1, %v0_2
%to_2 = getelementptr double, ptr %to, i64 1
store double %v1_1, ptr %to
store double %v1_2, ptr %to_2
br i1 %c, label %lp, label %ext
ext:
ret void
}
; Lane 0 is from[1] + from[0] and lane 1 is %p + from[0]. Both runs expect a
; single <2 x double> load that is reused twice: once with %p inserted into
; lane 1 (<from[0], %p>, the left operand) and once reversed by a
; shufflevector (<from[1], from[0]>, the right operand). Lane 0 is therefore
; commuted to from[0] + from[1]. Parameters %v1 and %v2 are not used.
define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @shuffle_nodes_match1(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @shuffle_nodes_match1(
; SSE2-NEXT: entry:
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; SSE2-NEXT: ret void
;
entry:
br label %lp
lp:
; %p is 0.0 on entry and 1.0 on every back edge; it is the non-load operand.
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double, ptr %from, i64 1
%v0_1 = load double , ptr %from
%v0_2 = load double , ptr %from_1
; Lane 0: from[1] + from[0].
%v1_1 = fadd double %v0_2, %v0_1
; Lane 1: %p + from[0].
%v1_2 = fadd double %p, %v0_1
%to_2 = getelementptr double, ptr %to, i64 1
store double %v1_1, ptr %to
store double %v1_2, ptr %to_2
br i1 %c, label %lp, label %ext
ext:
ret void
}
; Lane 0 is from[0] + from[1] and lane 1 is %p + from[0]. The two runs expect
; different code here:
;  - corei7-avx keeps both scalar loads, broadcasts %v0_1 with a
;    zeroinitializer shuffle and adds it to <%v0_2, %p>;
;  - the +sse2 run uses one <2 x double> load, reuses it reversed
;    (<from[1], from[0]>) and with %p inserted into lane 1 (<from[0], %p>).
; Parameters %v1 and %v2 are not used by the body.
define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @vecload_vs_broadcast4(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @vecload_vs_broadcast4(
; SSE2-NEXT: entry:
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]]
; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; SSE2-NEXT: ret void
;
entry:
br label %lp
lp:
; %p is 0.0 on entry and 1.0 on every back edge; it is the non-load operand.
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double, ptr %from, i64 1
%v0_1 = load double , ptr %from
%v0_2 = load double , ptr %from_1
; Lane 0: from[0] + from[1].
%v1_1 = fadd double %v0_1, %v0_2
; Lane 1: %p + from[0].
%v1_2 = fadd double %p, %v0_1
%to_2 = getelementptr double, ptr %to, i64 1
store double %v1_1, ptr %to
store double %v1_2, ptr %to_2
br i1 %c, label %lp, label %ext
ext:
ret void
}
; Lane 0 is from[0] + from[1] and lane 1 is from[0] + %p, so %v0_1 is the left
; operand of both lanes. The two runs expect different code here:
;  - corei7-avx keeps both scalar loads and adds a broadcast of %v0_1 to
;    <%v0_2, %p>, preserving the source operand order;
;  - the +sse2 run uses one <2 x double> load and adds its reversed form
;    (<from[1], from[0]>) to <from[0], %p>, which commutes lane 0.
; Parameters %v1 and %v2 are not used by the body.
define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1, double %v2, i1 %c) {
; CHECK-LABEL: @shuffle_nodes_match2(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LP:%.*]]
; CHECK: lp:
; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4
; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; CHECK: ext:
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @shuffle_nodes_match2(
; SSE2-NEXT: entry:
; SSE2-NEXT: br label [[LP:%.*]]
; SSE2: lp:
; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1
; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4
; SSE2-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]]
; SSE2: ext:
; SSE2-NEXT: ret void
;
entry:
br label %lp
lp:
; %p is 0.0 on entry and 1.0 on every back edge; it is the non-load operand.
%p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
%from_1 = getelementptr double, ptr %from, i64 1
%v0_1 = load double , ptr %from
%v0_2 = load double , ptr %from_1
; Lane 0: from[0] + from[1].
%v1_1 = fadd double %v0_1, %v0_2
; Lane 1: from[0] + %p.
%v1_2 = fadd double %v0_1, %p
%to_2 = getelementptr double, ptr %to, i64 1
store double %v1_1, ptr %to
store double %v1_2, ptr %to_2
br i1 %c, label %lp, label %ext
ext:
ret void
}
; Make sure we don't scramble operands when we reorder them and destroy
; 'good' source order.
; Array that good_load_order reads and updates in place.
@a = common global [32000 x float] zeroinitializer, align 16
; Each loop iteration performs five scalar multiplies of the form
;   a[i+k] = a[i+k+1] * (value loaded for a[i+k]),   k = 0..4
; where the k = 0 multiplier is the phi %1 (a[0] on entry, otherwise the last
; value loaded by the previous iteration). The index advances by 5 and the
; loop continues while the truncated index is below 31995.
; Both runs expect the first four multiplies to become one <4 x float> fmul:
; the left operand is the vector load starting at a[i+1] and the right operand
; is that same vector shifted up one lane with %1 inserted into lane 0. The
; fifth multiply stays scalar and takes lane 3 via extractelement.
define void @good_load_order() {
; CHECK-LABEL: @good_load_order(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
; CHECK: for.cond1.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16
; CHECK-NEXT: br label [[FOR_BODY3:%.*]]
; CHECK: for.body3:
; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4
; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
; CHECK-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
; CHECK-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @good_load_order(
; SSE2-NEXT: entry:
; SSE2-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
; SSE2: for.cond1.preheader:
; SSE2-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16
; SSE2-NEXT: br label [[FOR_BODY3:%.*]]
; SSE2: for.body3:
; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ]
; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
; SSE2-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; SSE2-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1
; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP3]]
; SSE2-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; SSE2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP4]]
; SSE2-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; SSE2-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4
; SSE2-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]]
; SSE2-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0
; SSE2-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]]
; SSE2-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4
; SSE2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
; SSE2-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]]
; SSE2-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4
; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3
; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]]
; SSE2-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
; SSE2-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; SSE2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995
; SSE2-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
; SSE2: for.end:
; SSE2-NEXT: ret void
;
entry:
br label %for.cond1.preheader
for.cond1.preheader:
; Seed value for the loop-carried multiplier: a[0].
%0 = load float, ptr @a, align 16
br label %for.body3
for.body3:
; %1 carries the last element loaded by the previous iteration (%10).
%1 = phi float [ %0, %for.cond1.preheader ], [ %10, %for.body3 ]
%indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
; a[i] = a[i+1] * %1
%2 = add nsw i64 %indvars.iv, 1
%arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2
%3 = load float, ptr %arrayidx, align 4
%arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv
%mul6 = fmul float %3, %1
store float %mul6, ptr %arrayidx5, align 4
; a[i+1] = a[i+2] * a[i+1]
%4 = add nsw i64 %indvars.iv, 2
%arrayidx11 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %4
%5 = load float, ptr %arrayidx11, align 4
%mul15 = fmul float %5, %3
store float %mul15, ptr %arrayidx, align 4
; a[i+2] = a[i+3] * a[i+2]
%6 = add nsw i64 %indvars.iv, 3
%arrayidx21 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %6
%7 = load float, ptr %arrayidx21, align 4
%mul25 = fmul float %7, %5
store float %mul25, ptr %arrayidx11, align 4
; a[i+3] = a[i+4] * a[i+3]
%8 = add nsw i64 %indvars.iv, 4
%arrayidx31 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %8
%9 = load float, ptr %arrayidx31, align 4
%mul35 = fmul float %9, %7
store float %mul35, ptr %arrayidx21, align 4
; a[i+4] = a[i+5] * a[i+4]; this is the multiply expected to remain scalar.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
%arrayidx41 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv.next
%10 = load float, ptr %arrayidx41, align 4
%mul45 = fmul float %10, %9
store float %mul45, ptr %arrayidx31, align 4
; Signed 32-bit compare of the truncated next index against 31995.
%11 = trunc i64 %indvars.iv.next to i32
%cmp2 = icmp slt i32 %11, 31995
br i1 %cmp2, label %for.body3, label %for.end
for.end:
ret void
}
; Check vectorization of the following code for the double data type:
; c[0] = a[0]+b[0];
; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
; Two-lane case: lane 0 adds a[0] + b[0] and lane 1 adds b[1] + a[1].
; Both runs expect one <2 x double> load from %b, one from %a, and a single
; vector fadd with the %b vector on the left, i.e. lane 0 is the lane that
; gets commuted.
define void @load_reorder_double(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){
; CHECK-LABEL: @load_reorder_double(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @load_reorder_double(
; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4
; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4
; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4
; SSE2-NEXT: ret void
;
; Lane 0: c[0] = a[0] + b[0]
%1 = load double, ptr %a
%2 = load double, ptr %b
%3 = fadd double %1, %2
store double %3, ptr %c
; Lane 1: c[1] = b[1] + a[1] (operands in the opposite order to lane 0)
%4 = getelementptr inbounds double, ptr %b, i64 1
%5 = load double, ptr %4
%6 = getelementptr inbounds double, ptr %a, i64 1
%7 = load double, ptr %6
%8 = fadd double %5, %7
%9 = getelementptr inbounds double, ptr %c, i64 1
store double %8, ptr %9
ret void
}
; Check vectorization of the following code for the float data type:
; c[0] = a[0]+b[0];
; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
; c[2] = a[2]+b[2];
; c[3] = a[3]+b[3];
; Four-lane case: lanes 0, 2 and 3 add a[k] + b[k]; lane 1 adds b[1] + a[1].
; Both runs expect one <4 x float> load from %a, one from %b, and a single
; vector fadd with the %a vector on the left, i.e. only lane 1 is commuted.
define void @load_reorder_float(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){
; CHECK-LABEL: @load_reorder_float(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @load_reorder_float(
; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4
; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; SSE2-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; SSE2-NEXT: store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4
; SSE2-NEXT: ret void
;
; Lane 0: c[0] = a[0] + b[0]
%1 = load float, ptr %a
%2 = load float, ptr %b
%3 = fadd float %1, %2
store float %3, ptr %c
; Lane 1: c[1] = b[1] + a[1] (the only lane with swapped operands)
%4 = getelementptr inbounds float, ptr %b, i64 1
%5 = load float, ptr %4
%6 = getelementptr inbounds float, ptr %a, i64 1
%7 = load float, ptr %6
%8 = fadd float %5, %7
%9 = getelementptr inbounds float, ptr %c, i64 1
store float %8, ptr %9
; Lane 2: c[2] = a[2] + b[2]
%10 = getelementptr inbounds float, ptr %a, i64 2
%11 = load float, ptr %10
%12 = getelementptr inbounds float, ptr %b, i64 2
%13 = load float, ptr %12
%14 = fadd float %11, %13
%15 = getelementptr inbounds float, ptr %c, i64 2
store float %14, ptr %15
; Lane 3: c[3] = a[3] + b[3]
%16 = getelementptr inbounds float, ptr %a, i64 3
%17 = load float, ptr %16
%18 = getelementptr inbounds float, ptr %b, i64 3
%19 = load float, ptr %18
%20 = fadd float %17, %19
%21 = getelementptr inbounds float, ptr %c, i64 3
store float %20, ptr %21
ret void
}
; Check that we properly reorder the code below so that it gets vectorized optimally:
; a[0] = (b[0]+c[0])+d[0];
; a[1] = d[1]+(b[1]+c[1]);
; a[2] = (b[2]+c[2])+d[2];
; a[3] = (b[3]+c[3])+d[3];
; Four lanes of a two-level add tree. Lanes 0, 2 and 3 compute
; (b[k] + c[k]) + d[k]; lane 1 computes d[1] + (b[1] + c[1]), so the inner
; fadd sits on the opposite side of the outer fadd in that lane.
; Both runs expect three <4 x float> loads and two vector fadds, with the
; outer fadd taking the %d vector on the left and the (b + c) vector on the
; right.
define void @opcode_reorder(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c,ptr noalias nocapture readonly %d) {
; CHECK-LABEL: @opcode_reorder(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
; CHECK-NEXT: ret void
;
; SSE2-LABEL: @opcode_reorder(
; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4
; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4
; SSE2-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4
; SSE2-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]]
; SSE2-NEXT: store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4
; SSE2-NEXT: ret void
;
; Lane 0: a[0] = (b[0] + c[0]) + d[0]
%1 = load float, ptr %b
%2 = load float, ptr %c
%3 = fadd float %1, %2
%4 = load float, ptr %d
%5 = fadd float %3, %4
store float %5, ptr %a
; Lane 1: a[1] = d[1] + (b[1] + c[1]) -- inner sum is the right operand here
%6 = getelementptr inbounds float, ptr %d, i64 1
%7 = load float, ptr %6
%8 = getelementptr inbounds float, ptr %b, i64 1
%9 = load float, ptr %8
%10 = getelementptr inbounds float, ptr %c, i64 1
%11 = load float, ptr %10
%12 = fadd float %9, %11
%13 = fadd float %7, %12
%14 = getelementptr inbounds float, ptr %a, i64 1
store float %13, ptr %14
; Lane 2: a[2] = (b[2] + c[2]) + d[2]
%15 = getelementptr inbounds float, ptr %b, i64 2
%16 = load float, ptr %15
%17 = getelementptr inbounds float, ptr %c, i64 2
%18 = load float, ptr %17
%19 = fadd float %16, %18
%20 = getelementptr inbounds float, ptr %d, i64 2
%21 = load float, ptr %20
%22 = fadd float %19, %21
%23 = getelementptr inbounds float, ptr %a, i64 2
store float %22, ptr %23
; Lane 3: a[3] = (b[3] + c[3]) + d[3]
%24 = getelementptr inbounds float, ptr %b, i64 3
%25 = load float, ptr %24
%26 = getelementptr inbounds float, ptr %c, i64 3
%27 = load float, ptr %26
%28 = fadd float %25, %27
%29 = getelementptr inbounds float, ptr %d, i64 3
%30 = load float, ptr %29
%31 = fadd float %28, %30
%32 = getelementptr inbounds float, ptr %a, i64 3
store float %31, ptr %32
ret void
}