Files
clang-p2996/llvm/test/CodeGen/AArch64/avoid-free-ext-promotion.ll
Eli Friedman c83f23d6ab [AArch64] Fix heuristics for folding "lsl" into load/store ops. (#86894)
The existing heuristics were assuming that every core behaves like an
Apple A7, where any extend/shift costs an extra micro-op... but in
reality, nothing else behaves like that.

On some older Cortex designs, shifts by 1 or 4 cost extra, but all other
shifts/extensions are free. On all other cores, as far as I can tell,
all shifts/extensions for integer loads are free (i.e. the same cost as
an unshifted load).

To reflect this, this patch:

- Enables aggressive folding of shifts into loads by default.

- Removes the old AddrLSLFast feature, since it applies to everything
except A7 (and even if you are explicitly targeting A7, we want to
assume extensions are free because the code will almost always run on a
newer core).

- Adds a new feature AddrLSLSlow14 that applies specifically to the
Cortex cores where shifts by 1 or 4 cost extra.

I didn't add support for AddrLSLSlow14 on the GlobalISel side because it
would require a bunch of refactoring to work correctly. Someone can pick
this up as a followup.
2024-04-04 11:25:44 -07:00

146 lines
5.1 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=arm64-apple-macosx -o - %s | FileCheck %s
%struct.zot = type { ptr, i32, ptr, ptr, i32, i32, i32, i32, i32, i32 }
; FIXME: currently the AND is performed on 64 bits unnecessarily.
; Test cases where an AND is extended from i32 -> i64 which is free. Make
; sure the extends do not get moved to the arguments, which would perform the
; AND on 64 bits unnecessarily.
; Single-use case: the i32 `and` result is stored back and also used, via a
; zext to i64, as the index of a halfword store. The extend should be folded
; into the store's addressing mode (the `uxtw #1` operand in the CHECK lines
; below) rather than widening the xor/and chain to 64 bits.
define void @avoid_promotion_1_and(ptr nocapture noundef %arg, ptr %p) {
; CHECK-LABEL: avoid_promotion_1_and:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: ldr w8, [x0, #52]
; CHECK-NEXT: mov w9, #10 ; =0xa
; CHECK-NEXT: LBB0_1: ; %bb8
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cmp w8, #3
; CHECK-NEXT: b.lo LBB0_1
; CHECK-NEXT: ; %bb.2: ; %bb9
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: ldr w10, [x0, #32]
; CHECK-NEXT: ldr w11, [x1, #76]
; CHECK-NEXT: ldr w12, [x1]
; CHECK-NEXT: eor w10, w10, w11
; CHECK-NEXT: and w10, w10, w12
; CHECK-NEXT: str w10, [x0, #32]
; CHECK-NEXT: strh w9, [x1, w10, uxtw #1]
; CHECK-NEXT: b LBB0_1
bb:
%gep = getelementptr inbounds %struct.zot, ptr %arg, i64 0, i32 9 ; field 9 -> byte offset 52 (matches ldr w8, [x0, #52])
%l = load i32, ptr %gep, align 4
%icmp = icmp ugt i32 %l, 2 ; loop-invariant guard; load+compare get hoisted above the loop
%gep1 = getelementptr inbounds %struct.zot, ptr %arg, i64 0, i32 4 ; field 4 -> byte offset 32
%gep2 = getelementptr inbounds %struct.zot, ptr %arg, i64 0, i32 7 ; unused in this function
br label %bb8
bb8: ; preds = %bb27, %bb
br i1 %icmp, label %bb9, label %bb27
bb9: ; preds = %bb8
%l10 = load i32, ptr %gep1, align 8
%gep14 = getelementptr inbounds i32, ptr %p, i64 19 ; byte offset 76 (matches ldr w11, [x1, #76])
%l15 = load i32, ptr %gep14, align 1
%xor = xor i32 %l10, %l15
%l17 = load i32, ptr %p, align 8
%and = and i32 %xor, %l17 ; the AND that must remain a 32-bit `and w...` op
store i32 %and, ptr %gep1, align 8
%zext19 = zext i32 %and to i64 ; free extend; expected to fold into the strh addressing mode
%gep20 = getelementptr inbounds i16, ptr %p, i64 %zext19
store i16 10, ptr %gep20, align 2 ; expected: strh w9, [x1, w10, uxtw #1]
br label %bb27
bb27: ; preds = %bb9, %bb8
br label %bb8
}
; Multi-field case: walks an array of %struct.zot (sizeof = 56, matching the
; `add x8, x8, #56` stride in the CHECK lines) and combines several fields.
; Both `%and` and `%and24` are zero-extended to index i16 memory ops; the
; extends should fold into the ldrh/strh addressing modes (`uxtw #1`) instead
; of promoting the 32-bit arithmetic to 64 bits.
define void @avoid_promotion_2_and(ptr nocapture noundef %arg) {
; CHECK-LABEL: avoid_promotion_2_and:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: add x8, x0, #32
; CHECK-NEXT: b LBB1_2
; CHECK-NEXT: LBB1_1: ; %latch
; CHECK-NEXT: ; in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: cmp w9, #2
; CHECK-NEXT: add x8, x8, #56
; CHECK-NEXT: b.ls LBB1_4
; CHECK-NEXT: LBB1_2: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr w9, [x8, #20]
; CHECK-NEXT: cmp w9, #3
; CHECK-NEXT: b.lo LBB1_1
; CHECK-NEXT: ; %bb.3: ; %then
; CHECK-NEXT: ; in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: ldp w13, w12, [x8, #12]
; CHECK-NEXT: ldr w10, [x8]
; CHECK-NEXT: ldr x11, [x0]
; CHECK-NEXT: ldr w14, [x8, #8]
; CHECK-NEXT: lsl w10, w10, w13
; CHECK-NEXT: ldrb w11, [x11, x12]
; CHECK-NEXT: eor w10, w10, w11
; CHECK-NEXT: ldur w11, [x8, #-24]
; CHECK-NEXT: and w10, w10, w14
; CHECK-NEXT: ldp x14, x13, [x8, #-16]
; CHECK-NEXT: str w10, [x8]
; CHECK-NEXT: and w11, w11, w12
; CHECK-NEXT: ldrh w15, [x13, w10, uxtw #1]
; CHECK-NEXT: strh w15, [x14, w11, uxtw #1]
; CHECK-NEXT: strh w12, [x13, w10, uxtw #1]
; CHECK-NEXT: b LBB1_1
; CHECK-NEXT: LBB1_4: ; %exit
; CHECK-NEXT: ret
entry:
br label %loop
loop:
%p = phi i64 [ 0, %entry ], [ %p.next, %latch ] ; element index; x8 advances by 56 bytes per iteration
%gep = getelementptr inbounds %struct.zot, ptr %arg, i64 %p, i32 9 ; field 9 (x8-relative offset 20)
%l = load i32, ptr %gep, align 4
%icmp = icmp ugt i32 %l, 2 ; loop guard; also reused as the latch exit condition below
%gep1 = getelementptr inbounds %struct.zot, ptr %arg, i64 %p, i32 4
%gep2 = getelementptr inbounds %struct.zot, ptr %arg, i64 %p, i32 7
%gep3 = getelementptr inbounds %struct.zot, ptr %arg, i64 %p, i32 8
%gep4 = getelementptr inbounds %struct.zot, ptr %arg, i64 %p, i32 6
%gep5 = getelementptr inbounds %struct.zot, ptr %arg, i64 %p, i32 3
%gep6 = getelementptr inbounds %struct.zot, ptr %arg, i64 %p, i32 2
%gep7 = getelementptr inbounds %struct.zot, ptr %arg, i64 %p, i32 1
br i1 %icmp, label %then, label %latch
then:
%l10 = load i32, ptr %gep1, align 8
%l11 = load i32, ptr %gep2, align 4
%shl = shl i32 %l10, %l11
%l12 = load ptr, ptr %arg, align 8
%l13 = load i32, ptr %gep3, align 8
%zext = zext i32 %l13 to i64 ; expected to fold into the ldrb addressing mode
%gep14 = getelementptr inbounds i8, ptr %l12, i64 %zext
%l15 = load i8, ptr %gep14, align 1
%zext16 = zext i8 %l15 to i32
%xor = xor i32 %shl, %zext16
%l17 = load i32, ptr %gep4, align 8
%and = and i32 %xor, %l17 ; must stay a 32-bit `and`
store i32 %and, ptr %gep1, align 8
%l18 = load ptr, ptr %gep5, align 8
%zext19 = zext i32 %and to i64 ; free extend; expected in ldrh/strh as `uxtw #1`
%gep20 = getelementptr inbounds i16, ptr %l18, i64 %zext19
%l21 = load i16, ptr %gep20, align 2
%l22 = load ptr, ptr %gep6, align 8
%l23 = load i32, ptr %gep7, align 8
%and24 = and i32 %l23, %l13 ; second 32-bit AND feeding an indexed store
%zext25 = zext i32 %and24 to i64
%gep26 = getelementptr inbounds i16, ptr %l22, i64 %zext25
store i16 %l21, ptr %gep26, align 2
%trunc = trunc i32 %l13 to i16
store i16 %trunc, ptr %gep20, align 2
br label %latch
latch:
%p.next = add i64 %p, 1
br i1 %icmp, label %loop, label %exit
exit:
ret void
}