clang-p2996/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
Jay Foad 359a792f9b [AMDGPU] SILoadStoreOptimizer: avoid unbounded register pressure increases
Previously, when combining two loads, this pass would sink the
first one down to the second one, putting the combined load
where the second one was. It would also sink any intervening
instructions which depended on the first load down to just
after the combined load.

For example, if we started with this sequence of
instructions (code flowing from left to right):

  X A B C D E F Y

After combining loads X and Y into XY we might end up with:

  A B C D E F XY

But if B, D and F depended on X, we would get:

  A C E XY B D F

Now if the original code had some short disjoint live ranges
from A to B, C to D and E to F, then in the transformed code
these live ranges become long and overlapping. In this way a
single merge of two loads can cause an unbounded increase in
register pressure.
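
A toy model can make the pressure difference concrete. The
C++ program below is purely illustrative (it is not part of
LLVM and all names in it are made up): it counts how many
values are live at once in each of the two schedules above,
with B, D and F taken as the last uses of the values defined
by A, C and E as well as uses of X. It prints a maximum
pressure of 2 before the transform and 4 after, and adding
more such pairs between the two loads grows the "after"
number without bound while the "before" number stays at 2.

  // Toy live-range model, invented for illustration only.
  #include <algorithm>
  #include <cstdio>
  #include <utility>
  #include <vector>

  // Maximum number of simultaneously live values, given a
  // (def, last-use) index pair for each value.
  static int maxPressure(const std::vector<std::pair<int, int>> &Ranges) {
    int Best = 0;
    for (const auto &R : Ranges) {
      int Live = 0;
      for (const auto &S : Ranges)
        if (S.first <= R.first && R.first <= S.second)
          ++Live;
      Best = std::max(Best, Live);
    }
    return Best;
  }

  int main() {
    // Original order X A B C D E F Y (indices 0..7): X is live
    // from 0 to its last use F at 6; A..B, C..D, E..F are short.
    std::vector<std::pair<int, int>> Before = {{0, 6}, {1, 2}, {3, 4}, {5, 6}};
    // Transformed order A C E XY B D F (indices 0..6): XY is
    // defined at 3, and A, C, E now stay live until 4, 5, 6.
    std::vector<std::pair<int, int>> After = {{3, 6}, {0, 4}, {1, 5}, {2, 6}};
    std::printf("max pressure before: %d\n", maxPressure(Before)); // prints 2
    std::printf("max pressure after:  %d\n", maxPressure(After));  // prints 4
    return 0;
  }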

To fix this, change the way that loads are moved in order to
merge them so that:
- The second load is moved up to the first one. (But when
  merging stores, we still move the first store down to the
  second one.)
- Intervening instructions are never moved.
- Instead, if we find an intervening instruction that would
  need to be moved, give up on the merge (see the sketch
  after this list). This case should now be pretty rare,
  because normal stores have no outputs, and normal loads
  only have address register inputs, which will be identical
  for any pair of loads that we try to merge.
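
As a rough sketch of the new bail-out rule (hypothetical
helper names; this is not the real SILoadStoreOptimizer code,
and it ignores memory aliasing and any other hazard checks
the real pass would need), hoisting the second load is only
allowed when nothing in between defines a register that the
second load reads:

  // Simplified model of the check, invented for illustration.
  #include <string>
  #include <unordered_set>
  #include <vector>

  struct Inst {
    std::unordered_set<std::string> Defs; // registers written
    std::unordered_set<std::string> Uses; // registers read
  };

  // True if Second can be hoisted up across all of Between
  // without moving any of them.
  static bool canHoistSecondLoad(const Inst &Second,
                                 const std::vector<Inst> &Between) {
    for (const Inst &I : Between)
      for (const std::string &R : Second.Uses)
        if (I.Defs.count(R))
          return false; // Second depends on I, so I would have to move: give up.
    return true;        // Nothing in between feeds Second: safe to hoist.
  }

  int main() {
    // Typical case: both loads read the same address register
    // %addr, and the instructions in between (B and D from the
    // example above) only define values derived from %x.
    Inst SecondLoad{{"%y"}, {"%addr"}};
    std::vector<Inst> Between = {{{"%b"}, {"%x"}}, {{"%d"}, {"%x"}}};
    return canHoistSecondLoad(SecondLoad, Between) ? 0 : 1; // exits with 0
  }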

As well as fixing the unbounded register pressure increase
problem, moving loads up and stores down seems like it
should usually be a win for memory latency reasons.

Differential Revision: https://reviews.llvm.org/D119006
2022-02-21 10:51:14 +00:00

; RUN: llc -march=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s

@a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
@b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
@c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4

; FIXME: Should combine the DS instructions into ds_write2 and ds_read2. This
; does not happen because when SILoadStoreOptimizer is run, the reads and writes
; are not adjacent. They are only moved later by MachineScheduler.
; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
; GCN: ds_write_b32
; GCN: ds_write_b32
; GCN: ds_read_b32
; GCN: ds_read_b32
; CHECK-LABEL: @no_clobber_ds_load_stores_x2
; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !3
; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !0, !noalias !3
; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !3, !noalias !0
; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !3, !noalias !0
define amdgpu_kernel void @no_clobber_ds_load_stores_x2(i32 addrspace(1)* %arg, i32 %i) {
bb:
  store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
  %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
  %val.a = load i32, i32 addrspace(3)* %gep.a, align 4
  store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
  %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
  %val.b = load i32, i32 addrspace(3)* %gep.b, align 4
  %val = add i32 %val.a, %val.b
  store i32 %val, i32 addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
; GCN-DAG: ds_write_b32
; GCN-DAG: ds_write_b32
; GCN-DAG: ds_write_b32
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_read_b32
; CHECK-LABEL: @no_clobber_ds_load_stores_x3
; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !5, !noalias !8
; CHECK: %val.a = load i32, i32 addrspace(3)* %gep.a, align 4, !alias.scope !5, !noalias !8
; CHECK: store i32 2, i32 addrspace(3)* %1, align 16, !alias.scope !11, !noalias !12
; CHECK: %val.b = load i32, i32 addrspace(3)* %gep.b, align 4, !alias.scope !11, !noalias !12
; CHECK: store i32 3, i32 addrspace(3)* %2, align 16, !alias.scope !13, !noalias !14
; CHECK: %val.c = load i32, i32 addrspace(3)* %gep.c, align 4, !alias.scope !13, !noalias !14
define amdgpu_kernel void @no_clobber_ds_load_stores_x3(i32 addrspace(1)* %arg, i32 %i) {
bb:
  store i32 1, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 0), align 4
  %gep.a = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @a, i32 0, i32 %i
  %val.a = load i32, i32 addrspace(3)* %gep.a, align 4
  store i32 2, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 0), align 4
  %gep.b = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @b, i32 0, i32 %i
  %val.b = load i32, i32 addrspace(3)* %gep.b, align 4
  store i32 3, i32 addrspace(3)* getelementptr inbounds ([64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 0), align 4
  %gep.c = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* @c, i32 0, i32 %i
  %val.c = load i32, i32 addrspace(3)* %gep.c, align 4
  %val.1 = add i32 %val.a, %val.b
  %val = add i32 %val.1, %val.c
  store i32 %val, i32 addrspace(1)* %arg, align 4
  ret void
}

; CHECK: !0 = !{!1}
; CHECK: !1 = distinct !{!1, !2}
; CHECK: !2 = distinct !{!2}
; CHECK: !3 = !{!4}
; CHECK: !4 = distinct !{!4, !2}
; CHECK: !5 = !{!6}
; CHECK: !6 = distinct !{!6, !7}
; CHECK: !7 = distinct !{!7}
; CHECK: !8 = !{!9, !10}
; CHECK: !9 = distinct !{!9, !7}
; CHECK: !10 = distinct !{!10, !7}
; CHECK: !11 = !{!9}
; CHECK: !12 = !{!6, !10}
; CHECK: !13 = !{!10}
; CHECK: !14 = !{!6, !9}