clang-p2996/llvm/test/CodeGen/X86/shift-coalesce.ll
Sanjay Patel f0dd12ec5c [x86] use zero-extending load of a byte outside of loops too (2nd try)
The first attempt missed updating test files generated by tools
(update_llc_test_checks.py).

Original commit message:

This implements the main suggested change from issue #56498.
Using the shorter (non-extending) instruction with only
-Oz ("minsize") rather than -Os ("optsize") is left as a
possible follow-up.

As noted in the bug report, the zero-extending load may have
shorter latency/better throughput across a wide range of x86
micro-arches, and it avoids a potential false dependency.
The cost is an extra instruction byte.

This could cause perf ups and downs from secondary effects,
but I don't think it is possible to account for those in
advance, and that will likely also depend on exact micro-arch.
This does bring LLVM x86 codegen more in line with existing
gcc codegen, so if problems are exposed they are more likely
to occur for both compilers.

Differential Revision: https://reviews.llvm.org/D129775
2022-07-19 21:27:08 -04:00
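The codegen difference is visible in the test below: the 8-bit shift count is now
loaded with a zero-extending movzx into the full 32-bit register rather than with a
plain byte move into its low 8 bits. A minimal before/after sketch in Intel syntax,
using the same addressing as the test (the "before" line is roughly what pre-commit
codegen would emit outside of a loop; it is illustrative, not part of this file):

    mov cl, byte ptr [eax]     # before: writes only cl, so the upper bits of ecx
                               # keep their old value, which can create a false
                               # dependency on the previous contents of ecx
    movzx ecx, byte ptr [eax]  # after: zero-extends into all of ecx, avoiding that
                               # dependency at the cost of one extra instruction byte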


; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=intel | FileCheck %s
; PR687
define i64 @foo(i64 %x, ptr %X) {
; CHECK-LABEL: foo:
; CHECK: # %bb.0:
; CHECK-NEXT: push esi
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: .cfi_offset esi, -8
; CHECK-NEXT: mov esi, dword ptr [esp + 8]
; CHECK-NEXT: mov edx, dword ptr [esp + 12]
; CHECK-NEXT: mov eax, dword ptr [esp + 16]
; CHECK-NEXT: movzx ecx, byte ptr [eax]
; CHECK-NEXT: mov eax, esi
; CHECK-NEXT: shl eax, cl
; CHECK-NEXT: shld edx, esi, cl
; CHECK-NEXT: test cl, 32
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mov edx, eax
; CHECK-NEXT: xor eax, eax
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: pop esi
; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: ret
%tmp.1 = load i64, ptr %X ; <i64> [#uses=1]
%tmp.3 = trunc i64 %tmp.1 to i8 ; <i8> [#uses=1]
%shift.upgrd.1 = zext i8 %tmp.3 to i64 ; <i64> [#uses=1]
%tmp.4 = shl i64 %x, %shift.upgrd.1 ; <i64> [#uses=1]
ret i64 %tmp.4
}
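
The commit message leaves the -Os vs. -Oz split as a possible follow-up. At the IR
level that split shows up as function attributes: -Os marks functions optsize, while
-Oz marks them minsize (and optsize), so a follow-up that keeps the shorter
non-extending byte load only under -Oz would key off the minsize attribute. A
hypothetical variant of the function above with that attribute, purely for
illustration (it is not part of the upstream test, and no output is asserted for it):

    ; Hypothetical -Oz-style function (not in the upstream file): clang attaches
    ; 'minsize' under -Oz, and the suggested follow-up would keep the shorter
    ; non-extending byte load only for functions marked this way.
    define i64 @foo_minsize(i64 %x, ptr %X) minsize {
      %amt.wide = load i64, ptr %X
      %amt.lo = trunc i64 %amt.wide to i8
      %amt = zext i8 %amt.lo to i64
      %res = shl i64 %x, %amt
      ret i64 %res
    }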