Files
clang-p2996/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll
Alexey Bataev 10085390c6 [SLP]Reduce number of alternate instruction, where possible
The previous version was reviewed at https://github.com/llvm/llvm-project/pull/123360
This version is mostly the same, adjusted after the graph-to-tree transformation.

This patch tries to remove wide alternate operations.
Currently, the SLP vectorizer emits something like this:
```
%0 = add i32
%1 = sub i32
%2 = add i32
%3 = sub i32
%4 = add i32
%5 = sub i32
%6 = add i32
%7 = sub i32

is transformed to

%v1 = add <8 x i32>
%v2 = sub <8 x i32>
%res = shuffle %v1, %v2, <0, 9, 2, 11, 4, 13, 6, 15>
```
i.e. half of the results are just unused. This leads to increased
register pressure and potentially doubles the number of operations.

The patch introduces a SplitVectorize mode, which splits the operations by
opcode and instead produces something like this:
```
%v1 = add <4 x i32>
%v2 = sub <4 x i32>
%res = shuffle %v1, %v2, <0, 4, 1, 5, 2, 6, 3, 7>
```
This improves performance by reducing the number of ops. It also
enables some other improvements, such as improved graph reordering.

-O3+LTO, AVX512
Metric: size..text
Program                                                                         size..text
                                                                                results     results0    diff
                       test-suite :: MultiSource/Benchmarks/Olden/tsp/tsp.test     2788.00     2820.00  1.1%
test-suite :: MultiSource/Benchmarks/Prolangs-C/TimberWolfMC/timberwolfmc.test   278168.00   280904.00  1.0%
               test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test    82682.00    83258.00  0.7%
                    test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test   139344.00   139712.00  0.3%
     test-suite :: MultiSource/Benchmarks/FreeBench/fourinarow/fourinarow.test    27149.00    27197.00  0.2%
               test-suite :: MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4.test  1008188.00  1009948.00  0.2%
          test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test    39226.00    39290.00  0.2%
   test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test    39229.00    39293.00  0.2%
 test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test  2074533.00  2076549.00  0.1%
test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test  2074533.00  2076549.00  0.1%
             test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test   798440.00   798952.00  0.1%
     test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test    44123.00    44139.00  0.0%
                       test-suite :: MultiSource/Benchmarks/Bullet/bullet.test   318942.00   319038.00  0.0%
        test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test  1159880.00  1160152.00  0.0%
     test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniAMR/miniAMR.test    73595.00    73611.00  0.0%
                test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test  1146124.00  1146348.00  0.0%
       test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/CLAMR.test   203831.00   203847.00  0.0%
 test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test   207662.00   207678.00  0.0%
                test-suite :: External/SPEC/CFP2006/447.dealII/447.dealII.test   589851.00   589883.00  0.0%
      test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test  1398543.00  1398559.00  0.0%
     test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test  1398543.00  1398559.00  0.0%
        test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test  2050990.00  2051006.00  0.0%

      test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12559687.00 12559591.00 -0.0%
                     test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test  3074157.00  3074125.00 -0.0%
         test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test  1092252.00  1092188.00 -0.0%
            test-suite :: External/SPEC/CFP2017rate/508.namd_r/508.namd_r.test   779763.00   779715.00 -0.0%
         test-suite :: MultiSource/Benchmarks/ASCI_Purple/SMG2000/smg2000.test   253517.00   253485.00 -0.0%
                  test-suite :: MultiSource/Applications/JM/lencod/lencod.test   848259.00   848035.00 -0.0%
     test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test    93064.00    93016.00 -0.1%
                  test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test   383747.00   383475.00 -0.1%
          test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test   673051.00   662907.00 -1.5%
           test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test   673051.00   662907.00 -1.5%

Olden/tsp - small variations
Prolangs-C/TimberWolfMC - small variations, some code not inlined
FreeBench/pifft - extra store <8 x double> vectorized, some other extra
vectorizations
CFP2006/433.milc - better vector code
FreeBench/fourinarow - better vector code
Benchmarks/tramp3d-v4 - extra vector code, small variations
mediabench/gsm/toast - small variations
MiBench/telecomm-gsm - small variations
CINT2017rate/500.perlbench_r
CINT2017speed/600.perlbench_s - better vector code, small variations
CINT2006/464.h264ref - some smaller code + changes similar to x264
DOE-ProxyApps-C/miniGMG - small variations
Benchmarks/Bullet - small variations
CFP2017rate/511.povray_r - small variations
DOE-ProxyApps-C/miniAMR - small variations
CFP2006/453.povray - small variations
DOE-ProxyApps-C++/CLAMR - small variations
MiBench/consumer-lame - small variations
CFP2006/447.dealII - small variations
CFP2017rate/538.imagick_r
CFP2017speed/638.imagick_s - small variations
CFP2017rate/510.parest_r - better vector code, small variations
CFP2017rate/526.blender_r - small variations
CINT2006/403.gcc - small variations
CINT2006/400.perlbench - small variations
CFP2017rate/508.namd_r - small variations
ASCI_Purple/SMG2000 - small variations
JM/lencod - extra store <16 x i32>, small variations
DOE-ProxyApps-C++/miniFE - small variations
JM/ldecod - extra vector code, small variations, fewer shuffles
CINT2017speed/625.x264_s
CINT2017rate/525.x264_r - the number of instructions increased, but
they appear to be more performant. E.g., for the function
x264_pixel_satd_8x8, llvm-mca reports better throughput - 84 for the
current version and 59 for the new version.

-O3+LTO, mcpu=sifive-p470

Metric: size..text

                                                                               results    results0   diff
                                 test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test  580768.00  581118.00   0.1%
                                        test-suite :: MultiSource/Applications/d/make_dparser.test   78854.00   78894.00   0.1%
                                      test-suite :: MultiSource/Applications/JM/lencod/lencod.test  633448.00  633750.00   0.0%
                                           test-suite :: MultiSource/Benchmarks/Bullet/bullet.test  277002.00  277080.00   0.0%
                             test-suite :: External/SPEC/CINT2006/400.perlbench/400.perlbench.test  931938.00  931960.00   0.0%
                                         test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test 2512806.00 2512822.00   0.0%
                                test-suite :: External/SPEC/CINT2017speed/602.gcc_s/602.gcc_s.test 7659880.00 7659876.00  -0.0%
                                 test-suite :: External/SPEC/CINT2017rate/502.gcc_r/502.gcc_r.test 7659880.00 7659876.00  -0.0%
                            test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 1602448.00 1602434.00  -0.0%
                          test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 9496664.00 9496542.00  -0.0%
                     test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test  147424.00  147422.00  -0.0%
                    test-suite :: External/SPEC/CINT2017speed/600.perlbench_s/600.perlbench_s.test 1764608.00 1764578.00  -0.0%
                     test-suite :: External/SPEC/CINT2017rate/500.perlbench_r/500.perlbench_r.test 1764608.00 1764578.00  -0.0%
                                     test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test  841656.00  841632.00  -0.0%
                                    test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test  949026.00  948962.00  -0.0%
                            test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test  946348.00  946284.00  -0.0%
                                      test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test  279794.00  279764.00  -0.0%
                       test-suite :: MultiSource/Benchmarks/mediabench/g721/g721encode/encode.test    4776.00    4772.00  -0.1%
                              test-suite :: MultiSource/Benchmarks/mediabench/gsm/toast/toast.test   25074.00   25028.00  -0.2%
                       test-suite :: MultiSource/Benchmarks/MiBench/telecomm-gsm/telecomm-gsm.test   25074.00   25028.00  -0.2%
                         test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C/miniGMG/miniGMG.test   29336.00   29184.00  -0.5%
                               test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test  535390.00  510124.00  -4.7%
                              test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test  535390.00  510124.00  -4.7%
test-suite :: SingleSource/Regression/C/gcc-c-torture/execute/ieee/GCC-C-execute-ieee-pr50310.test     886.00     608.00 -31.4%

CINT2006/464.h264ref - extra v16i32 reduction
d/make_dparser - better vector code
JM/lencod - extra v16i32 reduction
Benchmarks/Bullet - smaller vector code
CINT2006/400.perlbench - better vector code
CINT2006/403.gcc - small variations
CINT2017speed/602.gcc_s
CINT2017rate/502.gcc_r - small variations
CFP2017rate/510.parest_r - small variations
CFP2017rate/526.blender_r - small variations
MiBench/consumer-lame - small variations
CINT2017speed/600.perlbench_s
CINT2017rate/500.perlbench_r - small variations
Benchmarks/7zip - small variations
CFP2017rate/511.povray_r - small variations
JM/ldecod - extra vector code
mediabench/g721/g721encode - extra vector code
mediabench/gsm - extra vector code
MiBench/telecomm-gsm - extra vector code
DOE-ProxyApps-C/miniGMG - extra vector code
CINT2017rate/525.x264_r
CINT2017speed/625.x264_s - reduced number of wide operations and
shuffles, saving the registers, similar to X86, extra code in
pixel_hadamard_ac vectorized
ieee/GCC-C-execute-ieee-pr50310 - extra code vectorized

CINT2006/464.h264ref - extra vector code in find_sad_16x16
JM/lencod - extra vector code in find_sad_16x16
d/make_dparser - smaller vector code
Benchmarks/Bullet - small variations
CINT2006/400.perlbench - smaller vector code
CFP2017rate/526.blender_r - small variations, extra store <8 x float> in
the loop, extra store <8 x i8> in loop
CINT2017rate/500.perlbench_r
CINT2017speed/600.perlbench_s - small variations
MiBench/consumer-lame - small variations
JM/ldecod - extra vector code
mediabench/g721/g721encode - small variations

Reviewers: hiraditya

Reviewed By: hiraditya

Pull Request: https://github.com/llvm/llvm-project/pull/128907
2025-03-12 08:18:51 -07:00

223 lines
11 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
; SLP-vectorizer test input. @test takes four scalar arguments (i32, i8, i64,
; float) and builds several scalar chains of the same shape
; (and/add/lshr/trunc/smax, int-to-float conversions, fdiv, fabs, bitcast, icmp,
; select, or). The combined i64 results are stored through a null pointer in
; the last block. The autogenerated assertions that follow the signature pin the
; expected output of the pass, which includes <8 x float> vectors assembled
; with llvm.vector.insert from <4 x float> and <2 x float> pieces.
;
; The constant 1325400064 used by every icmp is 0x4F000000, i.e. the bit pattern
; of the float value 2^31.
define void @test(i32 %0, i8 %1, i64 %2, float %3) {
; CHECK-LABEL: define void @test(
; CHECK-SAME: i32 [[TMP0:%.*]], i8 [[TMP1:%.*]], i64 [[TMP2:%.*]], float [[TMP3:%.*]]) {
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], <i64 255, i64 -65536>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> <i64 1, i64 poison>, <2 x i32> <i32 2, i32 0>
; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP9]], <i64 1, i64 16>
; CHECK-NEXT: [[TMP11:%.*]] = trunc <2 x i64> [[TMP10]] to <2 x i8>
; CHECK-NEXT: [[TMP12:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[TMP11]], <2 x i8> zeroinitializer)
; CHECK-NEXT: [[TMP13:%.*]] = uitofp <2 x i8> [[TMP12]] to <2 x float>
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = uitofp <4 x i8> [[TMP15]] to <4 x float>
; CHECK-NEXT: [[TMP17:%.*]] = fdiv <2 x float> [[TMP13]], zeroinitializer
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP18]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = sitofp <2 x i32> [[TMP19]] to <2 x float>
; CHECK-NEXT: [[TMP21:%.*]] = fdiv <2 x float> zeroinitializer, [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = trunc <2 x i64> [[TMP7]] to <2 x i32>
; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> zeroinitializer, [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = ashr <2 x i32> [[TMP23]], splat (i32 1)
; CHECK-NEXT: [[TMP25:%.*]] = sitofp <2 x i32> [[TMP24]] to <2 x float>
; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP28:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP27]], <4 x float> [[TMP16]], i64 4)
; CHECK-NEXT: [[TMP29:%.*]] = fdiv <8 x float> zeroinitializer, [[TMP28]]
; CHECK-NEXT: [[TMP30:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP29]])
; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x float> [[TMP30]] to <8 x i32>
; CHECK-NEXT: [[TMP32:%.*]] = icmp ult <8 x i32> [[TMP31]], splat (i32 1325400064)
; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i1> [[TMP32]], i32 6
; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i64 0, i64 2147483648
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP32]], i32 5
; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 4286578688
; CHECK-NEXT: [[TMP37:%.*]] = or i64 [[TMP34]], [[TMP36]]
; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i1> [[TMP32]], i32 7
; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i64 0, i64 128
; CHECK-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP32]], i32 4
; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[TMP40]], i64 0, i64 128
; CHECK-NEXT: [[TMP42:%.*]] = extractelement <8 x i1> [[TMP32]], i32 2
; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i64 0, i64 8388608
; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP32]], i32 3
; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i64 0, i64 32768
; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP43]], [[TMP45]]
; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i1> [[TMP32]], i32 0
; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 0, i64 8388608
; CHECK-NEXT: [[TMP49:%.*]] = extractelement <8 x i1> [[TMP32]], i32 1
; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i64 0, i64 32768
; CHECK-NEXT: br label %[[BB52:.*]]
; CHECK: [[BB51:.*]]:
; CHECK-NEXT: unreachable
; CHECK: [[BB52]]:
; CHECK-NEXT: br label %[[BB53:.*]]
; CHECK: [[BB53]]:
; CHECK-NEXT: [[TMP54:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP17]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 0, ptr null)
; CHECK-NEXT: [[TMP55:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP21]])
; CHECK-NEXT: [[TMP56:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <8 x float> [[TMP56]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP58:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP57]], <2 x float> [[TMP55]], i64 0)
; CHECK-NEXT: [[TMP59:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP58]], <2 x float> [[TMP54]], i64 6)
; CHECK-NEXT: [[TMP60:%.*]] = bitcast <8 x float> [[TMP59]] to <8 x i32>
; CHECK-NEXT: [[TMP61:%.*]] = icmp ult <8 x i32> [[TMP60]], splat (i32 1325400064)
; CHECK-NEXT: [[TMP62:%.*]] = extractelement <8 x i1> [[TMP61]], i32 5
; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP62]], i64 [[TMP37]], i64 0
; CHECK-NEXT: [[TMP64:%.*]] = extractelement <8 x i1> [[TMP61]], i32 4
; CHECK-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i64 0, i64 4294967168
; CHECK-NEXT: [[TMP66:%.*]] = or i64 [[TMP63]], [[TMP65]]
; CHECK-NEXT: [[TMP67:%.*]] = extractelement <8 x i1> [[TMP61]], i32 7
; CHECK-NEXT: [[TMP68:%.*]] = select i1 [[TMP67]], i64 0, i64 8388608
; CHECK-NEXT: [[TMP69:%.*]] = extractelement <8 x i1> [[TMP61]], i32 6
; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP69]], i64 0, i64 32768
; CHECK-NEXT: [[TMP71:%.*]] = or i64 [[TMP68]], [[TMP70]]
; CHECK-NEXT: [[TMP72:%.*]] = or i64 [[TMP71]], [[TMP66]]
; CHECK-NEXT: [[TMP73:%.*]] = or i64 [[TMP72]], [[TMP39]]
; CHECK-NEXT: store i64 [[TMP73]], ptr null, align 1
; CHECK-NEXT: store i64 [[TMP41]], ptr null, align 1
; CHECK-NEXT: [[TMP74:%.*]] = extractelement <8 x i1> [[TMP61]], i32 3
; CHECK-NEXT: [[TMP75:%.*]] = select i1 [[TMP74]], i64 0, i64 -9223372036854775808
; CHECK-NEXT: [[TMP76:%.*]] = extractelement <8 x i1> [[TMP61]], i32 2
; CHECK-NEXT: [[TMP77:%.*]] = zext i1 [[TMP76]] to i64
; CHECK-NEXT: [[TMP78:%.*]] = or i64 [[TMP46]], [[TMP77]]
; CHECK-NEXT: [[TMP79:%.*]] = or i64 [[TMP78]], [[TMP75]]
; CHECK-NEXT: store i64 [[TMP79]], ptr null, align 1
; CHECK-NEXT: [[TMP80:%.*]] = extractelement <8 x i1> [[TMP61]], i32 1
; CHECK-NEXT: [[TMP81:%.*]] = select i1 [[TMP80]], i64 0, i64 2147483648
; CHECK-NEXT: [[TMP82:%.*]] = extractelement <8 x i1> [[TMP61]], i32 0
; CHECK-NEXT: [[TMP83:%.*]] = select i1 [[TMP82]], i64 0, i64 128
; CHECK-NEXT: [[TMP84:%.*]] = or i64 [[TMP83]], [[TMP50]]
; CHECK-NEXT: [[TMP85:%.*]] = or i64 [[TMP84]], [[TMP48]]
; CHECK-NEXT: [[TMP86:%.*]] = or i64 [[TMP85]], [[TMP81]]
; CHECK-NEXT: store i64 [[TMP86]], ptr null, align 1
; CHECK-NEXT: br label %[[BB51]]
;
; Entry block (implicitly numbered %4 after the four unnamed arguments).
; Two i64 lanes are derived from %2: its low 8 bits (%5) and the value with its
; low 16 bits cleared (%6). Each lane is adjusted, shifted, truncated to i8 and
; clamped to be non-negative with smax(x, 0).
%5 = and i64 %2, 255
%6 = and i64 %2, -65536
%7 = add i64 %5, 1
%8 = add i64 %2, %6
%9 = lshr i64 %7, 1
%10 = trunc i64 %9 to i8
%11 = tail call i8 @llvm.smax.i8(i8 %10, i8 0)
%12 = lshr i64 %8, 16
%13 = trunc i64 %12 to i8
%14 = tail call i8 @llvm.smax.i8(i8 %13, i8 0)
; Convert the clamped bytes and three copies of the i8 argument to float.
%15 = uitofp i8 %11 to float
%16 = uitofp i8 %14 to float
%17 = uitofp i8 %1 to float
%18 = uitofp i8 %1 to float
%19 = uitofp i8 %1 to float
; %21 and %22 are only consumed in block 78; %20, %26, %27 and %38 feed the
; fabs/bitcast/icmp/select chains right here.
%20 = fdiv float 0.000000e+00, %17
%21 = fdiv float %15, 0.000000e+00
%22 = fdiv float %16, 0.000000e+00
%23 = call float @llvm.fabs.f32(float %20)
%24 = bitcast float %23 to i32
%25 = icmp ult i32 %24, 1325400064
%26 = fdiv float 0.000000e+00, %18
%27 = fdiv float 0.000000e+00, %19
%28 = call float @llvm.fabs.f32(float %27)
%29 = bitcast float %28 to i32
%30 = icmp ult i32 %29, 1325400064
%31 = select i1 %30, i64 0, i64 2147483648
%32 = call float @llvm.fabs.f32(float %26)
%33 = bitcast float %32 to i32
%34 = icmp ult i32 %33, 1325400064
%35 = select i1 %34, i64 0, i64 4286578688
%36 = or i64 %31, %35
%37 = select i1 %25, i64 0, i64 128
%38 = fdiv float 0.000000e+00, %17
%39 = call float @llvm.fabs.f32(float %38)
%40 = bitcast float %39 to i32
%41 = icmp ult i32 %40, 1325400064
%42 = select i1 %41, i64 0, i64 128
; Second group: truncate the two masked lanes to i32, negate them, shift right
; arithmetically by one and convert to float, alongside two conversions of the
; i32 argument.
%43 = trunc i64 %5 to i32
%44 = sub i32 0, %43
%45 = trunc i64 %6 to i32
%46 = sub i32 0, %45
%47 = ashr i32 %44, 1
%48 = ashr i32 %46, 1
%49 = sitofp i32 %0 to float
%50 = sitofp i32 %47 to float
%51 = sitofp i32 %48 to float
%52 = sitofp i32 %0 to float
%53 = fdiv float 0.000000e+00, %50
%54 = fdiv float 0.000000e+00, %51
%55 = call float @llvm.fabs.f32(float %53)
%56 = bitcast float %55 to i32
%57 = icmp ult i32 %56, 1325400064
%58 = call float @llvm.fabs.f32(float %54)
%59 = bitcast float %58 to i32
%60 = icmp ult i32 %59, 1325400064
%61 = select i1 %60, i64 0, i64 8388608
%62 = select i1 %57, i64 0, i64 32768
%63 = or i64 %61, %62
; %65 and %66 repeat the divisions %53 and %54 (same operands); %64 and %67 use
; the converted i32 argument and are only consumed in block 78.
%64 = fdiv float 0.000000e+00, %49
%65 = fdiv float 0.000000e+00, %50
%66 = fdiv float 0.000000e+00, %51
%67 = fdiv float 0.000000e+00, %52
%68 = call float @llvm.fabs.f32(float %65)
%69 = bitcast float %68 to i32
%70 = icmp ult i32 %69, 1325400064
%71 = call float @llvm.fabs.f32(float %66)
%72 = bitcast float %71 to i32
%73 = icmp ult i32 %72, 1325400064
%74 = select i1 %73, i64 0, i64 8388608
%75 = select i1 %70, i64 0, i64 32768
br label %77
; Block 76 is the branch target at the end of block 78 and ends in unreachable.
76: ; preds = %78
unreachable
; Block 77 only forwards control from the entry block to block 78.
77: ; preds = %4
br label %78
; Block 78 finishes the comparison chains using values computed in the entry
; block (%21, %22, %64, %67) together with comparisons on the raw bits of the
; float argument %3, then ORs the selected masks together and stores the four
; resulting i64 values through a null pointer.
78: ; preds = %77
%79 = call float @llvm.fabs.f32(float %22)
%80 = bitcast float %79 to i32
%81 = icmp ult i32 %80, 1325400064
%82 = call float @llvm.fabs.f32(float %21)
%83 = bitcast float %82 to i32
%84 = icmp ult i32 %83, 1325400064
%85 = bitcast float %3 to i32
%86 = icmp ult i32 %85, 1325400064
%87 = select i1 %86, i64 %36, i64 0
%88 = bitcast float %3 to i32
%89 = icmp ult i32 %88, 1325400064
%90 = select i1 %89, i64 0, i64 4294967168
%91 = or i64 %87, %90
%92 = select i1 %81, i64 0, i64 8388608
%93 = select i1 %84, i64 0, i64 32768
%94 = or i64 %92, %93
%95 = or i64 %94, %91
%96 = or i64 %95, %37
store i64 %96, ptr null, align 1
; Zero-sized lifetime marker on a null pointer, placed between the first two stores.
call void @llvm.lifetime.start.p0(i64 0, ptr null)
store i64 %42, ptr null, align 1
%97 = bitcast float %3 to i32
%98 = icmp ult i32 %97, 1325400064
%99 = select i1 %98, i64 0, i64 -9223372036854775808
%100 = bitcast float %3 to i32
%101 = icmp ult i32 %100, 1325400064
%102 = zext i1 %101 to i64
%103 = or i64 %63, %102
%104 = or i64 %103, %99
store i64 %104, ptr null, align 1
%105 = call float @llvm.fabs.f32(float %67)
%106 = bitcast float %105 to i32
%107 = icmp ult i32 %106, 1325400064
%108 = call float @llvm.fabs.f32(float %64)
%109 = bitcast float %108 to i32
%110 = icmp ult i32 %109, 1325400064
%111 = select i1 %107, i64 0, i64 2147483648
%112 = select i1 %110, i64 0, i64 128
%113 = or i64 %112, %75
%114 = or i64 %113, %74
%115 = or i64 %114, %111
store i64 %115, ptr null, align 1
br label %76
}