This commit skips the expansion of the `vector.reduce.add` intrinsic on
vector-enabled SystemZ targets so that it can instead be custom-lowered
for legal vector types using the VSUM instructions.
The custom handling is limited to full vectors with scalar types up to
`i32` due to performance concerns.
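
Mechanically, the expansion is skipped via the target's `shouldExpandReduction`
hook, so SelectionDAG sees a single `VECREDUCE_ADD` node that the backend can
lower itself. The following is only a rough sketch of that idea; the exact
predicate and code structure in `SystemZISelLowering.cpp` may differ:

```cpp
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Sketch: returning false keeps vector.reduce.add as one VECREDUCE_ADD node
// for the custom VSUM-based lowering; returning true requests the generic
// shuffle-and-add expansion instead.
bool SystemZTargetLowering::shouldExpandReduction(const IntrinsicInst *II) const {
  if (II->getIntrinsicID() != Intrinsic::vector_reduce_add)
    return true;
  auto *VecTy = cast<FixedVectorType>(II->getArgOperand(0)->getType());
  // Assumption: only full vector registers with element types up to i32 take
  // the VSUM path; wider elements and subvectors expand as before.
  return VecTy->getScalarSizeInBits() > 32 ||
         VecTy->getPrimitiveSizeInBits().getFixedValue() % 128 != 0;
}
```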
It also adds tests for the generated custom lowering and adapts the
related cost computation along with its tests.
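
On the cost-model side, the numbers now roughly follow the instruction
sequences shown in the tests below. As an illustration only (a hypothetical
helper, not the actual `SystemZTTIImpl` code, and ignoring the `vgbm` zero
materialization):

```cpp
#include <cassert>

// NumVectorRegs: 128-bit registers holding the input vector.
// ScalarBits: element width (8, 16, or 32 on the VSUM path).
static unsigned reduceAddCost(unsigned NumVectorRegs, unsigned ScalarBits) {
  assert(ScalarBits <= 32 && "VSUM path only covers i8/i16/i32 elements");
  unsigned Cost = NumVectorRegs - 1;  // vab/vah/vaf to collapse to one register
  Cost += ScalarBits == 32 ? 1 : 2;   // vsumb/vsumh down to i32, then vsumqf
  return Cost + 1;                    // vlgvf to move the result into a GPR
}
```

For example, `reduceAddCost(4, 32)` gives 5, matching the three `vaf`s plus
`vsumqf` plus `vlgvf` checked in `f3_3` below.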
The expected result is a performance boost in benchmarks that make heavy
use of `vector.reduce.add`, while other benchmarks remain unchanged.
For instance, the assembly generated for `llvm.vector.reduce.add.v4i32` changes from
```hlasm
vmrlg %v0, %v24, %v24
vaf %v0, %v24, %v0
vrepf %v1, %v0, 1
vaf %v0, %v0, %v1
vlgvf %r2, %v0, 0
```
to
```hlasm
vgbm %v0, 0
vsumqf %v0, %v24, %v0
vlgvf %r2, %v0, 3
```
```llvm
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; Test vector add reduction intrinsic
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s

; 1 vector length
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
declare i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a)
; 2 vector lengths
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a)
declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a)
; TODO
; 4 vector lengths
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %a)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %a)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a)
declare i128 @llvm.vector.reduce.add.v4i128(<4 x i128> %a)
; Subvector lengths
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)

define i8 @f1_1(<16 x i8> %a) {
; CHECK-LABEL: f1_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vgbm %v0, 0
; CHECK-NEXT: vsumb %v1, %v24, %v0
; CHECK-NEXT: vsumqf %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
  %redadd = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
  ret i8 %redadd
}

define i16 @f1_2(<8 x i16> %a) {
; CHECK-LABEL: f1_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vgbm %v0, 0
; CHECK-NEXT: vsumh %v1, %v24, %v0
; CHECK-NEXT: vsumqf %v0, %v1, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
  %redadd = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
  ret i16 %redadd
}

define i32 @f1_3(<4 x i32> %a) {
; CHECK-LABEL: f1_3:
; CHECK: # %bb.0:
; CHECK-NEXT: vgbm %v0, 0
; CHECK-NEXT: vsumqf %v0, %v24, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14

  %redadd = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  ret i32 %redadd
}

define i64 @f1_4(<2 x i64> %a) {
; CHECK-LABEL: f1_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepg %v0, %v24, 1
; CHECK-NEXT: vag %v0, %v24, %v0
; CHECK-NEXT: vlgvg %r2, %v0, 0
; CHECK-NEXT: br %r14

  %redadd = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
  ret i64 %redadd
}

define i128 @f1_5(<1 x i128> %a) {
; CHECK-LABEL: f1_5:
; CHECK: # %bb.0:
; CHECK-NEXT: vst %v24, 0(%r2), 3
; CHECK-NEXT: br %r14
  %redadd = call i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a)
  ret i128 %redadd
}

define i8 @f2_1(<32 x i8> %a) {
; CHECK-LABEL: f2_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vab %v0, %v24, %v26
; CHECK-NEXT: vgbm %v1, 0
; CHECK-NEXT: vsumb %v0, %v0, %v1
; CHECK-NEXT: vsumqf %v0, %v0, %v1
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
  %redadd = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a)
  ret i8 %redadd
}

define i16 @f2_2(<16 x i16> %a) {
; CHECK-LABEL: f2_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vah %v0, %v24, %v26
; CHECK-NEXT: vgbm %v1, 0
; CHECK-NEXT: vsumh %v0, %v0, %v1
; CHECK-NEXT: vsumqf %v0, %v0, %v1
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
  %redadd = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a)
  ret i16 %redadd
}

define i32 @f2_3(<8 x i32> %a) {
; CHECK-LABEL: f2_3:
; CHECK: # %bb.0:
; CHECK-NEXT: vaf %v0, %v24, %v26
; CHECK-NEXT: vgbm %v1, 0
; CHECK-NEXT: vsumqf %v0, %v0, %v1
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14

  %redadd = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
  ret i32 %redadd
}

define i64 @f2_4(<4 x i64> %a) {
; CHECK-LABEL: f2_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vag %v0, %v24, %v26
; CHECK-NEXT: vrepg %v1, %v0, 1
; CHECK-NEXT: vag %v0, %v0, %v1
; CHECK-NEXT: vlgvg %r2, %v0, 0
; CHECK-NEXT: br %r14

  %redadd = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a)
  ret i64 %redadd
}

define i128 @f2_5(<2 x i128> %a) {
; CHECK-LABEL: f2_5:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 16(%r3), 3
; CHECK-NEXT: vl %v1, 0(%r3), 3
; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
  %redadd = call i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a)
  ret i128 %redadd
}

define i8 @f3_1(<64 x i8> %a) {
; CHECK-LABEL: f3_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vab %v0, %v26, %v30
; CHECK-NEXT: vab %v1, %v24, %v28
; CHECK-NEXT: vab %v0, %v1, %v0
; CHECK-NEXT: vgbm %v1, 0
; CHECK-NEXT: vsumb %v0, %v0, %v1
; CHECK-NEXT: vsumqf %v0, %v0, %v1
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
  %redadd = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %a)
  ret i8 %redadd
}

define i16 @f3_2(<32 x i16> %a) {
; CHECK-LABEL: f3_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vah %v0, %v26, %v30
; CHECK-NEXT: vah %v1, %v24, %v28
; CHECK-NEXT: vah %v0, %v1, %v0
; CHECK-NEXT: vgbm %v1, 0
; CHECK-NEXT: vsumh %v0, %v0, %v1
; CHECK-NEXT: vsumqf %v0, %v0, %v1
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
  %redadd = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %a)
  ret i16 %redadd
}

define i32 @f3_3(<16 x i32> %a) {
; CHECK-LABEL: f3_3:
; CHECK: # %bb.0:
; CHECK-NEXT: vaf %v0, %v26, %v30
; CHECK-NEXT: vaf %v1, %v24, %v28
; CHECK-NEXT: vaf %v0, %v1, %v0
; CHECK-NEXT: vgbm %v1, 0
; CHECK-NEXT: vsumqf %v0, %v0, %v1
; CHECK-NEXT: vlgvf %r2, %v0, 3
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14

  %redadd = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a)
  ret i32 %redadd
}

define i64 @f3_4(<8 x i64> %a) {
; CHECK-LABEL: f3_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vag %v0, %v26, %v30
; CHECK-NEXT: vag %v1, %v24, %v28
; CHECK-NEXT: vag %v0, %v1, %v0
; CHECK-NEXT: vrepg %v1, %v0, 1
; CHECK-NEXT: vag %v0, %v0, %v1
; CHECK-NEXT: vlgvg %r2, %v0, 0
; CHECK-NEXT: br %r14

  %redadd = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a)
  ret i64 %redadd
}

define i128 @f3_5(<4 x i128> %a) {
; CHECK-LABEL: f3_5:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 32(%r3), 3
; CHECK-NEXT: vl %v1, 0(%r3), 3
; CHECK-NEXT: vl %v2, 48(%r3), 3
; CHECK-NEXT: vl %v3, 16(%r3), 3
; CHECK-NEXT: vaq %v2, %v3, %v2
; CHECK-NEXT: vaq %v0, %v1, %v0
; CHECK-NEXT: vaq %v0, %v0, %v2
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: br %r14
  %redadd = call i128 @llvm.vector.reduce.add.v4i128(<4 x i128> %a)
  ret i128 %redadd
}


define i8 @f4_1(<8 x i8> %a) {
; CHECK-LABEL: f4_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpkg %v0, %v24, %v24
; CHECK-NEXT: vab %v0, %v24, %v0
; CHECK-NEXT: vpkf %v1, %v0, %v0
; CHECK-NEXT: vab %v0, %v0, %v1
; CHECK-NEXT: vrepb %v1, %v0, 1
; CHECK-NEXT: vab %v0, %v0, %v1
; CHECK-NEXT: vlgvb %r2, %v0, 0
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
  %redadd = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
  ret i8 %redadd
}

define i16 @f4_2(<4 x i16> %a) {
; CHECK-LABEL: f4_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpkg %v0, %v24, %v24
; CHECK-NEXT: vah %v0, %v24, %v0
; CHECK-NEXT: vreph %v1, %v0, 1
; CHECK-NEXT: vah %v0, %v0, %v1
; CHECK-NEXT: vlgvh %r2, %v0, 0
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14
  %redadd = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
  ret i16 %redadd
}

define i32 @f4_3(<2 x i32> %a) {
; CHECK-LABEL: f4_3:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepf %v0, %v24, 1
; CHECK-NEXT: vaf %v0, %v24, %v0
; CHECK-NEXT: vlgvf %r2, %v0, 0
; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
; CHECK-NEXT: br %r14

  %redadd = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
  ret i32 %redadd
}

define i64 @f4_4(<1 x i64> %a) {
; CHECK-LABEL: f4_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vlgvg %r2, %v24, 0
; CHECK-NEXT: br %r14

  %redadd = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
  ret i64 %redadd
}
```