The patch tries to keep the original order of the instructions in the reductions. Previously, the first two instructions were swapped, giving a reversed order. This is the first step toward supporting ordered reductions. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/98025
219 lines
11 KiB
LLVM
219 lines
11 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=YAML %s
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=YAML %s

; These tests check that we remove from consideration pairs of seed
; getelementptrs when they are known to have a constant difference. Such pairs
; are likely not good candidates for vectorization since one can be computed
; from the other. We use an unprofitable threshold to force vectorization.
;
; int getelementptr(int *g, int n, int w, int x, int y, int z) {
;   int sum = 0;
;   for (int i = 0; i < n ; ++i) {
;     sum += g[2*i + w]; sum += g[2*i + x];
;     sum += g[2*i + y]; sum += g[2*i + z];
;   }
;   return sum;
; }
;

; YAML-LABEL: Function: getelementptr_4x32
; YAML: --- !Passed
; YAML-NEXT: Pass: slp-vectorizer
; YAML-NEXT: Name: VectorizedList
; YAML-NEXT: Function: getelementptr_4x32
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '4'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '3'

; YAML: --- !Passed
; YAML-NEXT: Pass: slp-vectorizer
; YAML-NEXT: Name: VectorizedList
; YAML-NEXT: Function: getelementptr_4x32
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '6'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '3'

; Sums g[2*i + 0], g[2*i + x], g[2*i + y] and g[2*i + z] for i in [0, n).
; The offsets 0, x, y and z are a constant and three function arguments, so no
; two index expressions differ by a compile-time constant; presumably none of
; the getelementptr pairs is therefore dropped from the seed list.
; The assertions below expect the index additions to be vectorized as two
; <2 x i32> adds (lanes {0, x} and {y, z}) while the four loads and the running
; sum stay scalar. The remark file is expected to contain two VectorizedList
; remarks for this function, with costs 4 and 6 and tree size 3.
define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: @getelementptr_4x32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i64 1
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
; CHECK-NEXT: [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP0]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i64 0
; CHECK-NEXT: [[TMP7:%.*]] = zext nneg i32 [[TMP6]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP7]]
; CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i64 1
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
; CHECK-NEXT: [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0
; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP12]]
; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1
; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP14]]
; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
; CHECK-NEXT: [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
entry:
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
  ret i32 %sum.0.lcssa

for.body:
  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
  ; %t4 = 2*i, the common base of all four element indices.
  %t4 = shl nsw i32 %indvars.iv, 1
  %t5 = add nsw i32 %t4, 0
  %arrayidx = getelementptr inbounds i32, ptr %g, i32 %t5
  %t6 = load i32, ptr %arrayidx, align 4
  %add1 = add nsw i32 %t6, %sum.032
  %t7 = add nsw i32 %t4, %x
  %arrayidx5 = getelementptr inbounds i32, ptr %g, i32 %t7
  %t8 = load i32, ptr %arrayidx5, align 4
  %add6 = add nsw i32 %add1, %t8
  %t9 = add nsw i32 %t4, %y
  %arrayidx10 = getelementptr inbounds i32, ptr %g, i32 %t9
  %t10 = load i32, ptr %arrayidx10, align 4
  %add11 = add nsw i32 %add6, %t10
  %t11 = add nsw i32 %t4, %z
  %arrayidx15 = getelementptr inbounds i32, ptr %g, i32 %t11
  %t12 = load i32, ptr %arrayidx15, align 4
  %add16 = add nsw i32 %add11, %t12
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next , %n
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

; YAML-LABEL: Function: getelementptr_2x32
; YAML: --- !Passed
; YAML: Pass: slp-vectorizer
; YAML: Name: VectorizedList
; YAML: Function: getelementptr_2x32
; YAML: Args:
; YAML: - String: 'SLP vectorized with cost '
; YAML: - Cost: '4'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '3'

; Sums g[2*i + 0], g[2*i + 1], g[2*i + y] and g[2*i + z] for i in [0, n).
; Here the first two element indices differ by the constant 1, so one can be
; computed from the other. The assertions below expect only the {0, y} index
; additions to form a <2 x i32> add (the z index addition stays scalar), the
; adjacent elements g[2*i] and g[2*i + 1] to be read with a single <2 x i32>
; load, and the four loaded values to be summed with
; llvm.vector.reduce.add.v4i32. The remark file is expected to contain a
; VectorizedList remark for this function with cost 4 and tree size 3.
define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: @getelementptr_2x32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[Y:%.*]], i64 1
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup.loopexit:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OP_RDX]], [[FOR_BODY]] ]
; CHECK-NEXT: [[T4:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP0]]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = zext nneg i32 [[TMP4]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP3]], i64 1
; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP8]]
; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT: [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[T11]] to i64
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T10]], i64 2
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[T12]], i64 3
; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP13]], [[SUM_032]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
entry:
  %cmp31 = icmp sgt i32 %n, 0
  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
  ret i32 %sum.0.lcssa

for.body:
  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
  ; %t4 = 2*i, the common base of all four element indices.
  %t4 = shl nsw i32 %indvars.iv, 1
  %t5 = add nsw i32 %t4, 0
  %arrayidx = getelementptr inbounds i32, ptr %g, i32 %t5
  %t6 = load i32, ptr %arrayidx, align 4
  %add1 = add nsw i32 %t6, %sum.032
  ; Constant offset 1: this index is exactly one element past %t5.
  %t7 = add nsw i32 %t4, 1
  %arrayidx5 = getelementptr inbounds i32, ptr %g, i32 %t7
  %t8 = load i32, ptr %arrayidx5, align 4
  %add6 = add nsw i32 %add1, %t8
  %t9 = add nsw i32 %t4, %y
  %arrayidx10 = getelementptr inbounds i32, ptr %g, i32 %t9
  %t10 = load i32, ptr %arrayidx10, align 4
  %add11 = add nsw i32 %add6, %t10
  %t11 = add nsw i32 %t4, %z
  %arrayidx15 = getelementptr inbounds i32, ptr %g, i32 %t11
  %t12 = load i32, ptr %arrayidx15, align 4
  %add16 = add nsw i32 %add11, %t12
  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
  %exitcond = icmp eq i32 %indvars.iv.next , %n
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}