Files
clang-p2996/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
Philip Reames b12930e133 [RISCV] Switch to using get.active.lane.mask when tail folding
The motivation here is to a) bring us closer into alignment with AArch64 under the assumption that codepath is better tested, and b) simplify pattern matching in an upcoming change.

The immediate impact is a significant IR reduction but a fairly minimal change in the generated assembly. Due to a difference in expansion behavior we get a saturating add vs an unsaturating one for the old code, but that's about it. This difference comes down to different handling of overflow, which doesn't seem to be possible here anyways, so the assembly codegen is arguably a minor regression. I don't expect that to matter in practice.

Differential Revision: https://reviews.llvm.org/D129221
2022-07-08 10:24:59 -07:00

264 lines
12 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -prefer-predicate-over-epilogue=predicate-dont-vectorize -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=CHECK
; Exercise tail folding on RISCV w/scalable vectors.
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-LABEL: @vector_add(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]]
; CHECK-NEXT: br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP5]], i64 1024)
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0(ptr [[TMP7]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]], <vscale x 1 x i64> poison)
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 1 x i64> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[TMP8]], ptr [[TMP7]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ELEM]], [[V]]
; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
%elem = load i64, ptr %arrayidx
%add = add i64 %elem, %v
store i64 %add, ptr %arrayidx
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.end, label %for.body
for.end:
ret void
}
; a[b[i]] = v, exercise scatter support
define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
; CHECK-LABEL: @indexed_store(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
; CHECK-NEXT: [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; CHECK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
; CHECK-NEXT: store i64 [[V:%.*]], ptr [[AADDR]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%baddr = getelementptr inbounds i64, ptr %b, i64 %iv
%aidx = load i64, ptr %baddr
%aaddr = getelementptr inbounds i64, ptr %a, i64 %aidx
store i64 %v, ptr %aaddr
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.end, label %for.body
for.end:
ret void
}
define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
; CHECK-LABEL: @indexed_load(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
; CHECK-NEXT: [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; CHECK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ]
; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%sum = phi i64 [0, %entry], [%sum.next, %for.body]
%baddr = getelementptr inbounds i64, ptr %b, i64 %iv
%aidx = load i64, ptr %baddr
%aaddr = getelementptr inbounds i64, ptr %a, i64 %aidx
%elem = load i64, ptr %aaddr
%iv.next = add nuw nsw i64 %iv, 1
%sum.next = add i64 %sum, %elem
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.end, label %for.body
for.end:
ret i64 %sum.next
}
define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-LABEL: @splat_int(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 -1025, [[TMP0]]
; CHECK-NEXT: br i1 [[TMP1]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 [[TMP5]], i64 1024)
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0(<vscale x 1 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, <vscale x 1 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
store i64 %v, ptr %arrayidx
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.end, label %for.body
for.end:
ret void
}
define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
; CHECK-LABEL: @uniform_store(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: store i64 [[V:%.*]], ptr [[B:%.*]], align 8
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
store i64 %v, ptr %b, align 8
%arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
store i64 %v, ptr %arrayidx
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.end, label %for.body
for.end:
ret void
}
define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
; CHECK-LABEL: @uniform_load(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ]
; CHECK-NEXT: ret i64 [[V_LCSSA]]
;
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%v = load i64, ptr %b, align 8
%arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
store i64 %v, ptr %arrayidx
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.end, label %for.body
for.end:
ret i64 %v
}