On 64-bit targets we can promote i32 CTTZ nodes to i64 CTTZ_ZERO_UNDEF by setting the 32nd bit of the zero-extended source, which guarantees the input is non-zero. #57811 also asked whether we should use BTS instead of MOVABS+OR to avoid an i64 immediate - I'm willing to tweak the DAGToDAG isel peephole for these cases if reviewers think it worthwhile, but most recent CPUs actually handle MOVABS faster than BTS/BTC/BTR. Reapplied with the missing cost model changes - the cost tables can probably be improved in a follow-up patch. Fixes #57811
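For reference, a minimal LLVM IR sketch of the equivalence the promotion relies on (illustrative only, not part of the test file below; the function name is made up): OR-ing in bit 32 of the zero-extended value makes the i64 input provably non-zero, so the zero-is-poison form of cttz is safe and still returns 32 when the original i32 was zero.

declare i64 @llvm.cttz.i64(i64, i1)

define i32 @cttz32_via_i64(i32 %x) {
  %wide = zext i32 %x to i64
  %bit32 = or i64 %wide, 4294967296                   ; 1 << 32, so the value can never be zero
  %tz = call i64 @llvm.cttz.i64(i64 %bit32, i1 true)  ; zero-is-poison form is safe here
  %res = trunc i64 %tz to i32                         ; result is at most 32, so the truncation is lossless
  ret i32 %res
}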
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; PR90847 - failure to peek through FREEZE(SETCC()) results in VPMOVMSKB(TRUNC()) instead of VMOVMSKPS
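; Without peeking through the freeze, the compare result is truncated and masked with VPMOVMSKB
; in the integer domain; the checks below expect the FP compare to feed VMOVMSKPS directly.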
define i32 @PR90847(<8 x float> %x) nounwind {
; AVX1-LABEL: PR90847:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: vminps %ymm1, %ymm0, %ymm1
; AVX1-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[1,0,3,2]
; AVX1-NEXT: vminps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT: vminps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vcmpeqps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
; AVX1-NEXT: orq %rax, %rcx
; AVX1-NEXT: rep bsfq %rcx, %rax
; AVX1-NEXT: # kill: def $eax killed $eax killed $rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR90847:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vminps %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[1,0,3,2]
; AVX2-NEXT: vminps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-NEXT: vminps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vcmpeqps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: movabsq $4294967296, %rcx # imm = 0x100000000
; AVX2-NEXT: orq %rax, %rcx
; AVX2-NEXT: rep bsfq %rcx, %rax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
entry:
%shuf1 = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%min1 = tail call noundef <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %x, <8 x float> %shuf1)
%shuf2 = shufflevector <8 x float> %min1, <8 x float> poison, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%min2 = tail call noundef <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %min1, <8 x float> %shuf2)
%shuf3 = shufflevector <8 x float> %min2, <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%min3 = tail call noundef <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %min2, <8 x float> %shuf3)
%fcmp = fcmp oeq <8 x float> %min3, %x
%mask = bitcast <8 x i1> %fcmp to i8
%zext = zext i8 %mask to i32
%cmp = icmp eq i8 %mask, 0
%tz = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %zext, i1 false)
%conv = select i1 %cmp, i32 undef, i32 %tz
ret i32 %conv
}