Files
clang-p2996/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
Peter Waller ade47bdc31 [LV] Improve register pressure estimate at high VFs
Previously, `getRegUsageForType` was implemented using
`getTypeLegalizationCost`.  `getRegUsageForType` is used by the loop
vectorizer to estimate the register pressure caused by using a vector
type.  However, `getTypeLegalizationCost` currently appears to
understand only splitting and not scalarization, so it significantly
underestimates the register requirements.

Instead, use `getNumRegisters`, which understands when scalarization
can occur (via computeRegisterProperties).

This was discovered while investigating D118979 (Set maximum VF with
shouldMaximizeVectorBandwidth), where under fixed-length 512-bit SVE the
loop vectorizer previously ended up costing a v128i1 as 2 v64i*
registers, whereas it actually occupies 128 i32 registers.

I'm sending this patch early for comment; I'm still doing some sanity checking
with LNT.  I note that getRegisterClassForType appears to return VectorRC even
though the types in question (large vNi1 types) end up occupying scalar
registers. That might be worth fixing too.

Differential Revision: https://reviews.llvm.org/D125918
2022-05-23 07:57:45 +00:00

33 lines
1.2 KiB
LLVM

; Test driver: runs the loop vectorizer over this file with its debug output
; enabled, discards the transformed IR, and pipes the debug stream (stderr
; redirected to stdout) into FileCheck.
; NOTE(review): `-loop-vectorize` is the legacy pass-manager spelling; newer
; opt builds may expect `-passes=loop-vectorize` -- confirm against the
; toolchain this test is run with.
; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s
; REQUIRES: asserts
target triple = "x86_64"
; Test that shows how many registers the loop vectorizer thinks an illegal <VF x i1> will consume.
; Expected register-usage report for the loop below at a vectorization factor
; of 64: two register classes are reported, with 136 registers in the generic
; vector class and 1 register in the generic scalar class.
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin>
; CHECK: LV(REG): VF = 64
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
; i1 or_reduction_avx(i32 %arg, ptr %ptr), compiled with the "+avx" feature.
;
; OR-reduces the i1 result of comparing each loaded i32 element against its
; own index. Roughly:
;
;   r = false; i = 0;
;   do { r |= (ptr[i] == i); ++i; } while (i != arg);
;   return r;
;
; The i1 reduction is what produces the <VF x i1> value whose register usage
; the expected output above reports.
define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" {
entry:
br label %loop
; Loop exit: return the final value of the OR reduction.
exit:
ret i1 %reduction_next
loop:
; %induction counts up from 0; %reduction carries the running OR, seeded
; with false.
%induction = phi i32 [ 0, %entry ], [ %induction_next, %loop ]
%reduction = phi i1 [ 0, %entry ], [ %reduction_next, %loop ]
; Load the i32 element at index %induction.
%gep = getelementptr inbounds i32, ptr %ptr, i32 %induction
%loaded = load i32, ptr %gep
; Compare the element with its index and fold the i1 result into the
; reduction.
%i1 = icmp eq i32 %loaded, %induction
%reduction_next = or i1 %i1, %reduction
; Advance the index and leave the loop once it equals %arg. The loop
; metadata !64 is attached to this branch.
%induction_next = add nuw i32 %induction, 1
%cond = icmp eq i32 %induction_next, %arg
br i1 %cond, label %exit, label %loop, !llvm.loop !64
}
; Loop metadata referenced by the loop's branch: !64 is the self-referential
; loop ID, and !65 sets the vectorization width hint to 64, the same
; vectorization factor that the expected output at the top of the file
; inspects.
!64 = distinct !{!64, !65}
!65 = !{!"llvm.loop.vectorize.width", i32 64}