Files
clang-p2996/llvm/test/Transforms/LoopVectorize/X86/CostModel/interleaved-load-half.ll
Sam Tebbs 70501ed2f0 [LoopVectorizer] Prune VFs based on plan register pressure (#132190)
This PR moves the register usage checking to after the plans are
created, so that any recipes that optimise register usage (such as
partial reductions) can be properly costed and not have their VF pruned
unnecessarily.

Depends on https://github.com/llvm/llvm-project/pull/137746
2025-05-19 13:27:17 +01:00

143 lines
6.2 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -passes=loop-vectorize -debug-only=loop-vectorize -mattr=avx512fp16 %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
; Purpose: run only the loop-vectorize pass with its debug output enabled and
; with the avx512fp16 feature turned on, then match cost lines that the
; vectorizer prints for interleaved half-precision loads. stderr is merged into
; stdout (2>&1) so FileCheck sees the debug messages. The run depends on the
; -debug-only option, and the test is restricted to the "asserts" lit feature.
;
; Target description: little-endian layout with 32-bit pointers, i386 Linux.
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386-unknown-linux-gnu"
; Input (@src) and in/out (@dst) arrays shared by both functions below:
; 120 half elements each, zero-initialized, 4-byte aligned.
@src = common local_unnamed_addr global [120 x half] zeroinitializer, align 4
@dst = common local_unnamed_addr global [120 x half] zeroinitializer, align 4
; Function Attrs: norecurse nounwind
; (The attribute comment above is an annotation only; the define line below
; carries no attributes itself.)
;
; stride8: each loop iteration updates 8 consecutive elements,
;   dst[i + j] += src[i + j] * k   for j = 0..7,
; then advances i by 8 and repeats while i < width_ (signed compare).
; The loop is skipped entirely when width_ <= 0.
;
; The FileCheck expectation inside the entry block pins the cost the vectorizer
; reports at VF 32 for the factor-8 interleave group anchored at the load %0.
; It refers to the value names %0 and %arrayidx, so those names must not change.
;
; Review note (to confirm): every access is marked align 4, but half elements
; at odd indices of a 4-byte aligned array would only be 2-byte aligned.
; Verify that this is intentional before reusing the pattern elsewhere.
define void @stride8(half %k, i32 %width_) {
entry:
; CHECK: Cost of 148 for VF 32: INTERLEAVE-GROUP with factor 8 at %0, ir<%arrayidx>
; Guard: only enter the loop when width_ > 0.
%cmp72 = icmp sgt i32 %width_, 0
br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup
; Preheader: single entry edge into the loop.
for.body.lr.ph: ; preds = %entry
br label %for.body
; Dedicated loop exit block that forwards to the common return.
for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void
for.body: ; preds = %for.body.lr.ph, %for.body
; i starts at 0 and is incremented by 8 per iteration (%add46).
%i.073 = phi i32 [ 0, %for.body.lr.ph ], [ %add46, %for.body ]
; Element 0: dst[i] += src[i] * k
%arrayidx = getelementptr inbounds [120 x half], ptr @src, i32 0, i32 %i.073
%0 = load half, ptr %arrayidx, align 4
%mul = fmul fast half %0, %k
%arrayidx2 = getelementptr inbounds [120 x half], ptr @dst, i32 0, i32 %i.073
%1 = load half, ptr %arrayidx2, align 4
%add3 = fadd fast half %1, %mul
store half %add3, ptr %arrayidx2, align 4
; Element 1: index is i | 1; the disjoint flag states the operands share no
; set bits, so this is the same value as i + 1.
%add4 = or disjoint i32 %i.073, 1
%arrayidx5 = getelementptr inbounds [120 x half], ptr @src, i32 0, i32 %add4
%2 = load half, ptr %arrayidx5, align 4
%mul6 = fmul fast half %2, %k
%arrayidx8 = getelementptr inbounds [120 x half], ptr @dst, i32 0, i32 %add4
%3 = load half, ptr %arrayidx8, align 4
%add9 = fadd fast half %3, %mul6
store half %add9, ptr %arrayidx8, align 4
; Element 2: dst[i|2] += src[i|2] * k
%add10 = or disjoint i32 %i.073, 2
%arrayidx11 = getelementptr inbounds [120 x half], ptr @src, i32 0, i32 %add10
%4 = load half, ptr %arrayidx11, align 4
%mul12 = fmul fast half %4, %k
%arrayidx14 = getelementptr inbounds [120 x half], ptr @dst, i32 0, i32 %add10
%5 = load half, ptr %arrayidx14, align 4
%add15 = fadd fast half %5, %mul12
store half %add15, ptr %arrayidx14, align 4
; Element 3: dst[i|3] += src[i|3] * k
%add16 = or disjoint i32 %i.073, 3
%arrayidx17 = getelementptr inbounds [120 x half], ptr @src, i32 0, i32 %add16
%6 = load half, ptr %arrayidx17, align 4
%mul18 = fmul fast half %6, %k
%arrayidx20 = getelementptr inbounds [120 x half], ptr @dst, i32 0, i32 %add16
%7 = load half, ptr %arrayidx20, align 4
%add21 = fadd fast half %7, %mul18
store half %add21, ptr %arrayidx20, align 4
; Element 4: dst[i|4] += src[i|4] * k
%add22 = or disjoint i32 %i.073, 4
%arrayidx23 = getelementptr inbounds [120 x half], ptr @src, i32 0, i32 %add22
%8 = load half, ptr %arrayidx23, align 4
%mul24 = fmul fast half %8, %k
%arrayidx26 = getelementptr inbounds [120 x half], ptr @dst, i32 0, i32 %add22
%9 = load half, ptr %arrayidx26, align 4
%add27 = fadd fast half %9, %mul24
store half %add27, ptr %arrayidx26, align 4
; Element 5: dst[i|5] += src[i|5] * k
%add28 = or disjoint i32 %i.073, 5
%arrayidx29 = getelementptr inbounds [120 x half], ptr @src, i32 0, i32 %add28
%10 = load half, ptr %arrayidx29, align 4
%mul30 = fmul fast half %10, %k
%arrayidx32 = getelementptr inbounds [120 x half], ptr @dst, i32 0, i32 %add28
%11 = load half, ptr %arrayidx32, align 4
%add33 = fadd fast half %11, %mul30
store half %add33, ptr %arrayidx32, align 4
; Element 6: dst[i|6] += src[i|6] * k
%add34 = or disjoint i32 %i.073, 6
%arrayidx35 = getelementptr inbounds [120 x half], ptr @src, i32 0, i32 %add34
%12 = load half, ptr %arrayidx35, align 4
%mul36 = fmul fast half %12, %k
%arrayidx38 = getelementptr inbounds [120 x half], ptr @dst, i32 0, i32 %add34
%13 = load half, ptr %arrayidx38, align 4
%add39 = fadd fast half %13, %mul36
store half %add39, ptr %arrayidx38, align 4
; Element 7: dst[i|7] += src[i|7] * k
%add40 = or disjoint i32 %i.073, 7
%arrayidx41 = getelementptr inbounds [120 x half], ptr @src, i32 0, i32 %add40
%14 = load half, ptr %arrayidx41, align 4
%mul42 = fmul fast half %14, %k
%arrayidx44 = getelementptr inbounds [120 x half], ptr @dst, i32 0, i32 %add40
%15 = load half, ptr %arrayidx44, align 4
%add45 = fadd fast half %15, %mul42
store half %add45, ptr %arrayidx44, align 4
; Advance the induction variable by the group size and loop while i < width_.
%add46 = add nuw nsw i32 %i.073, 8
%cmp = icmp slt i32 %add46, %width_
br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}
; Function Attrs: norecurse nounwind
; (The attribute comment above is an annotation only; the define line below
; carries no attributes itself.)
;
; stride3: each loop iteration updates 3 consecutive elements,
;   dst[i + j] += src[i + j] * k   for j = 0..2,
; then advances i by 3 and repeats while i < width_ (signed compare).
; The loop is skipped entirely when width_ <= 0. Here the loop branches
; straight to for.cond.cleanup; there is no separate loop-exit block.
;
; The FileCheck expectation inside the entry block pins the estimated cost the
; vectorizer reports at VF 32 for the first load, so the value name %0 must
; keep referring to that load.
;
; Review note (to confirm): every access is marked align 4, but half elements
; at odd indices of a 4-byte aligned array would only be 2-byte aligned.
; Verify that this is intentional before reusing the pattern elsewhere.
define void @stride3(half %k, i32 %width_) {
entry:
; CHECK: LV: Found an estimated cost of 18 for VF 32 For instruction: %0 = load half
; Guard: only enter the loop when width_ > 0.
%cmp27 = icmp sgt i32 %width_, 0
br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
; Preheader: single entry edge into the loop.
for.body.lr.ph: ; preds = %entry
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
ret void
for.body: ; preds = %for.body.lr.ph, %for.body
; i starts at 0 and is incremented by 3 per iteration (%add16).
%i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ]
; Element 0: dst[i] += src[i] * k
%arrayidx = getelementptr inbounds [120 x half], ptr @src, i32 0, i32 %i.028
%0 = load half, ptr %arrayidx, align 4
%mul = fmul fast half %0, %k
%arrayidx2 = getelementptr inbounds [120 x half], ptr @dst, i32 0, i32 %i.028
%1 = load half, ptr %arrayidx2, align 4
%add3 = fadd fast half %1, %mul
store half %add3, ptr %arrayidx2, align 4
; Element 1: dst[i+1] += src[i+1] * k
%add4 = add nuw nsw i32 %i.028, 1
%arrayidx5 = getelementptr inbounds [120 x half], ptr @src, i32 0, i32 %add4
%2 = load half, ptr %arrayidx5, align 4
%mul6 = fmul fast half %2, %k
%arrayidx8 = getelementptr inbounds [120 x half], ptr @dst, i32 0, i32 %add4
%3 = load half, ptr %arrayidx8, align 4
%add9 = fadd fast half %3, %mul6
store half %add9, ptr %arrayidx8, align 4
; Element 2: dst[i+2] += src[i+2] * k
%add10 = add nuw nsw i32 %i.028, 2
%arrayidx11 = getelementptr inbounds [120 x half], ptr @src, i32 0, i32 %add10
%4 = load half, ptr %arrayidx11, align 4
%mul12 = fmul fast half %4, %k
%arrayidx14 = getelementptr inbounds [120 x half], ptr @dst, i32 0, i32 %add10
%5 = load half, ptr %arrayidx14, align 4
%add15 = fadd fast half %5, %mul12
store half %add15, ptr %arrayidx14, align 4
; Advance the induction variable by the group size and loop while i < width_.
%add16 = add nuw nsw i32 %i.028, 3
%cmp = icmp slt i32 %add16, %width_
br i1 %cmp, label %for.body, label %for.cond.cleanup
}