Files
clang-p2996/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll
spupyrev cc2fbc648d [CodeLayout] Faster basic block reordering, ext-tsp (#68617)
Aggressive inlining might produce huge functions with >10K of basic 
blocks. Since BFI treats _all_ blocks and jumps as "hot" having 
non-negative (but perhaps small) weight, the current implementation can
be slow, taking minutes to produce a layout. This change introduces a
few modifications that significantly (up to 50x on some instances) 
speeds up the computation. Some notable changes:
- reduced the maximum chain size to 512 (from the prior 4096);
- introduced MaxMergeDensityRatio param to avoid merging chains with
very different densities;
- dropped a couple of params that seem unnecessary.

Looking at some "offline" metrics (e.g., the number of created 
fall-throughs), there shouldn't be problems; in fact, I do see some
metrics go up. But it might be hard/impossible to measure perf 
difference for such small changes. I did test the performance of a clang-14
binary and did not record any perf or i-cache-related differences.

My 6 benchmarks, with ext-tsp runtime (the lower the better) and
"tsp-score" (the higher the better).
**Before**:

- benchmark 1:
  num functions: 13,047
  reordering running time is 2.4 seconds
  score: 125503458 (128.3102%)
- benchmark 2:
  num functions: 16,438
  reordering running time is 3.4 seconds
  score: 12613997277 (129.7495%)
- benchmark 3:
  num functions: 12,359
  reordering running time is 1.9 seconds
  score: 1315881613 (105.8991%)
- benchmark 4:
  num functions: 96,588
  reordering running time is 7.3 seconds
  score: 89513906284 (100.3413%)
- benchmark 5:
  num functions: 1
  reordering running time is 372 seconds
  score: 21292505965077 (99.9979%)
- benchmark 6:
  num functions:  71,155
  reordering running time is 314 seconds
  score: 29795381626270671437824 (102.7519%)

**After**:
- benchmark 1:
  reordering running time is 2.2 seconds
  score: 125510418 (128.3130%)

- benchmark 2:
  reordering running time is 2.6 seconds
  score: 12614502162 (129.7525%)

- benchmark 3:
  reordering running time is 1.6 seconds
  score: 1315938168 (105.9024%)

- benchmark 4:
  reordering running time is 4.9 seconds
  score: 89518095837 (100.3454%)

- benchmark 5:
  reordering running time is 4.8 seconds
  score: 21292295939119 (99.9971%)

- benchmark 6:
  reordering running time is 104 seconds
  score: 29796710925310302879744 (102.7565%)
2023-10-25 07:52:26 -07:00

403 lines
7.7 KiB
LLVM

;; See also llvm/unittests/Transforms/Utils/CodeLayoutTest.cpp
; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 < %s | FileCheck %s
define void @func1a() {
; The heavier successor of the entry block is expected to be placed directly
; after it: b0 goes to b1 with weight 40 and to b2 with weight 100 (see !1),
; so the expected order is b0, b2, b1.
;
; +-----+
; | b0  | -+
; +-----+  |
;   |      |
;   | 40   |
;   v      |
; +-----+  |
; | b1  |  | 100
; +-----+  |
;   |      |
;   | 40   |
;   v      |
; +-----+  |
; | b2  | <+
; +-----+
;
; CHECK-LABEL: func1a:
; CHECK: b0
; CHECK: b2
; CHECK: b1

b0:
  %cond = call zeroext i1 @a()
  br i1 %cond, label %b1, label %b2, !prof !1

b1:
  call void @d()
  call void @d()
  call void @d()
  br label %b2

b2:
  call void @e()
  ret void
}
define void @func1b() {
; Same shape as func1a, but the b0->b1 weight is raised to 80 (see !2) while
; b0->b2 stays at 100. The expected order is b0, b1, b2: keeping both
; fallthroughs is preferred over following the single heaviest edge.
;
; +-----+
; | b0  | -+
; +-----+  |
;   |      |
;   | 80   |
;   v      |
; +-----+  |
; | b1  |  | 100
; +-----+  |
;   |      |
;   | 80   |
;   v      |
; +-----+  |
; | b2  | <+
; +-----+
;
; CHECK-LABEL: func1b:
; CHECK: b0
; CHECK: b1
; CHECK: b2

b0:
  %cond = call zeroext i1 @a()
  br i1 %cond, label %b1, label %b2, !prof !2

b1:
  call void @d()
  call void @d()
  call void @d()
  br label %b2

b2:
  call void @e()
  ret void
}
define void @func2() !prof !3 {
; The hot path b0 -> b3 -> b4 -> b2 is expected to stay contiguous, with the
; colder b1 placed last. The bracketed numbers in the sketch are the attached
; branch weights (!4, !5) divided by 100; b4 branches back to itself.
;
; +----+  [7]   +-------+
; | b1 | <----- |  b0   |
; +----+        +-------+
;   |             |
;   |             | [15]
;   |             v
;   |           +-------+
;   |           |  b3   |
;   |           +-------+
;   |             |
;   |             | [15]
;   |             v
;   |           +-------+   [31]
;   |           |       | -------+
;   |           |  b4   |        |
;   |           |       | <------+
;   |           +-------+
;   |             |
;   |             | [15]
;   |             v
;   |    [7]    +-------+
;   +---------> |  b2   |
;               +-------+
;
; CHECK-LABEL: func2:
; CHECK: b0
; CHECK: b3
; CHECK: b4
; CHECK: b2
; CHECK: b1

b0:
  call void @d()
  call void @d()
  call void @d()
  %go.b1 = call zeroext i1 @a()
  br i1 %go.b1, label %b1, label %b3, !prof !4

b1:
  call void @d()
  br label %b2

b2:
  call void @e()
  call void @e()
  call void @e()
  call void @e()
  call void @e()
  call void @e()
  call void @e()
  call void @e()
  ret void

b3:
  call void @d()
  br label %b4

b4:
  call void @d()
  %leave.loop = call zeroext i1 @a()
  br i1 %leave.loop, label %b2, label %b4, !prof !5
}
define void @func3() !prof !6 {
; A larger test where it is beneficial for locality to break the loop
;
; +--------+
; | b0 |
; +--------+
; |
; | [177]
; v
; +----+ [177] +---------------------------+
; | b5 | <------- | b1 |
; +----+ +---------------------------+
; | ^ ^
; | [196] | [124] | [70]
; v | |
; +----+ [70] +--------+ | |
; | b4 | <------- | b2 | | |
; +----+ +--------+ | |
; | | | |
; | | [124] | |
; | v | |
; | +--------+ | |
; | | b3 | -+ |
; | +--------+ |
; | |
; +-----------------------------------+
;
; CHECK-LABEL: func3:
; CHECK: b0
; CHECK: b1
; CHECK: b2
; CHECK: b3
; CHECK: b5
; CHECK: b4
b0:
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
call void @f()
br label %b1
b1:
%call = call zeroext i1 @a()
br i1 %call, label %b5, label %b2, !prof !7
b2:
call void @d()
call void @d()
call void @d()
call void @d()
%call2 = call zeroext i1 @a()
br i1 %call2, label %b3, label %b4, !prof !8
b3:
call void @d()
call void @f()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
call void @d()
br label %b1
b4:
call void @d()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
call void @e()
br label %b1
b5:
ret void
}
define void @func_loop() !prof !9 {
; Loop rotation with profile data present: the loop body is an if/else
; diamond (header -> if.then / if.else -> if.end) and if.end branches back to
; header. The expected order puts if.else and if.end ahead of header, with
; if.then after it.
;
; NOTE(review): only the header branch carries !prof (!10, 16:16). The if.end
; branch has no !prof attached, so the 31 and 1 shown in the sketch are not
; taken from metadata in this function; presumably they reflect the default
; back-edge probability estimate -- confirm.
;
;                  +--------+
;                  | entry  |
;                  +--------+
;                    |
;                    | 1
;                    v
; +--------+  16   +--------+
; | if.then| <---- | header | <+
; +--------+       +--------+  |
;   |                |         |
;   |                | 16      |
;   |                v         |
;   |              +--------+  |
;   |              | if.else|  | 31
;   |              +--------+  |
;   |                |         |
;   |                | 16      |
;   |                v         |
;   |        16    +--------+  |
;   +------------> | if.end | -+
;                  +--------+
;                    |
;                    | 1
;                    v
;                  +--------+
;                  |  end   |
;                  +--------+
;
; CHECK-LABEL: func_loop:
; CHECK: if.else
; CHECK: if.end
; CHECK: header
; CHECK: if.then

entry:
  br label %header

header:
  call void @e()
  %take.then = call zeroext i1 @a()
  br i1 %take.then, label %if.then, label %if.else, !prof !10

if.then:
  call void @f()
  br label %if.end

if.else:
  call void @g()
  br label %if.end

if.end:
  call void @h()
  %again = call zeroext i1 @a()
  br i1 %again, label %header, label %end

end:
  ret void
}
define void @func4() !prof !11 {
; Chains may be split when that yields more fallthroughs and so improves the
; objective. entry reaches b1 with weight 27 and b2 with weight 0 (!12);
; b1 reaches b2 with weight 17 and b3 with weight 10 (!13).
; The expected order is entry, b1, b3, b2.
;
; +-------+
; | entry |--------+
; +-------+        |
;   |              |
;   | 27           |
;   v              |
; +-------+        |
; |  b1   | -+     |
; +-------+  |     |
;   |        |     |
;   | 10     |     | 0
;   v        |     |
; +-------+  |     |
; |  b3   |  | 17  |
; +-------+  |     |
;   |        |     |
;   | 10     |     |
;   v        |     |
; +-------+  |     |
; |  b2   | <+ ----+
; +-------+
;
; CHECK-LABEL: func4:
; CHECK: entry
; CHECK: b1
; CHECK: b3
; CHECK: b2

entry:
  call void @b()
  %go.b1 = call zeroext i1 @a()
  br i1 %go.b1, label %b1, label %b2, !prof !12

b1:
  call void @c()
  %skip.b3 = call zeroext i1 @a()
  br i1 %skip.b3, label %b2, label %b3, !prof !13

b2:
  call void @d()
  ret void

b3:
  call void @e()
  br label %b2
}
; External callees referenced by the test functions; they are only declared
; in this file.

; @a supplies the i1 value that every conditional branch in the tests uses.
declare zeroext i1 @a()

; Void, argument-less callees invoked from the basic blocks.
declare void @b()
declare void @c()
declare void @d()
declare void @e()
declare void @f()
declare void @g()
declare void @h()
; Profile metadata. For "branch_weights" nodes the weights are listed in the
; same order as the successors of the branch they are attached to.

; func1a: b0 -> b1 / b2
!1 = !{!"branch_weights", i32 40, i32 100}
; func1b: b0 -> b1 / b2
!2 = !{!"branch_weights", i32 80, i32 100}
; func2: entry count, then b0 -> b1 / b3 and b4 -> b2 / b4
!3 = !{!"function_entry_count", i64 2200}
!4 = !{!"branch_weights", i32 700, i32 1500}
!5 = !{!"branch_weights", i32 1500, i32 3100}
; func3: entry count, then b1 -> b5 / b2 and b2 -> b3 / b4
!6 = !{!"function_entry_count", i64 177}
!7 = !{!"branch_weights", i32 177, i32 196}
!8 = !{!"branch_weights", i32 125, i32 70}
; func_loop: entry count, then header -> if.then / if.else
!9 = !{!"function_entry_count", i64 1}
!10 = !{!"branch_weights", i32 16, i32 16}
; func4: entry count, then entry -> b1 / b2 and b1 -> b2 / b3
!11 = !{!"function_entry_count", i64 1}
!12 = !{!"branch_weights", i32 27, i32 0}
!13 = !{!"branch_weights", i32 17, i32 10}