Files
clang-p2996/llvm/test/CodeGen/PowerPC/funnel-shift.ll
Sanjay Patel c71adc8040 [Intrinsics] define funnel shift IR intrinsics + DAG builder support
As discussed here:
http://lists.llvm.org/pipermail/llvm-dev/2018-May/123292.html
http://lists.llvm.org/pipermail/llvm-dev/2018-July/124400.html

We want to add rotate intrinsics because the IR expansion of that pattern is 4+ instructions, 
and we can lose pieces of the pattern before it gets to the backend. Generalizing the operation 
by allowing 2 different input values (plus the 3rd shift/rotate amount) gives us a "funnel shift" 
operation which may also be a single hardware instruction.

Initially, I thought we needed to define new DAG nodes for these ops, and I spent time working 
on that (a much larger patch), but then I concluded that we don't need them. At least as a first 
step, we already have all of the backend support necessary to match these ops, because matching 
the expanded pattern was already required. Shepherding these through the IR optimizer is the 
primary concern, so the IR intrinsics are likely all that we'll ever need.

There was also a question about converting the intrinsics to the existing ROTL/ROTR DAG nodes
(along with improving the oversized shift documentation). Again, I don't think that's strictly 
necessary (as the test results here prove). That can be an efficiency improvement as a small 
follow-up patch.

So all we're left with is documentation, definition of the IR intrinsics, and DAG builder support. 

Differential Revision: https://reviews.llvm.org/D49242

llvm-svn: 337221
2018-07-16 22:59:31 +00:00

272 lines
7.7 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=powerpc64le-- | FileCheck %s
; Declarations of the funnel-shift-left (fshl) and funnel-shift-right (fshr)
; intrinsics for i8, i16, i32, i64 and <4 x i32>. Each takes three operands of
; the same type. The i7 and i37 variants are declared next to the tests that
; use them.
declare i8 @llvm.fshl.i8(i8, i8, i8)
declare i16 @llvm.fshl.i16(i16, i16, i16)
declare i32 @llvm.fshl.i32(i32, i32, i32)
declare i64 @llvm.fshl.i64(i64, i64, i64)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare i8 @llvm.fshr.i8(i8, i8, i8)
declare i16 @llvm.fshr.i16(i16, i16, i16)
declare i32 @llvm.fshr.i32(i32, i32, i32)
declare i64 @llvm.fshr.i64(i64, i64, i64)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
; General case - all operands can be variables.
; Calls llvm.fshl.i32 with three variable operands (%x, %y, %z).
; The expected sequence masks the shift amount to 5 bits (andi. with 31),
; computes the complementary amount (32 - %z, also masked), ORs the two shifted
; halves, and finishes with an isel keyed on the condition set by andi. -
; presumably so that a masked amount of zero yields the first operand unchanged.
define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshl_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: subfic 6, 5, 32
; CHECK-NEXT: andi. 5, 5, 31
; CHECK-NEXT: clrlwi 6, 6, 27
; CHECK-NEXT: slw 5, 3, 5
; CHECK-NEXT: srw 4, 4, 6
; CHECK-NEXT: or 4, 5, 4
; CHECK-NEXT: isel 3, 3, 4, 2
; CHECK-NEXT: blr
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
}
; Verify that weird types are minimally supported.
; Calls llvm.fshl.i37 with three variable operands of the non-power-of-two
; width i37. Because 37 is not a power of two, the shift amount cannot be
; reduced with a simple mask; the expected code appears to compute the
; remainder modulo 37 with a multiply-high by a materialized constant
; (lis/ori/sldi/oris/ori + mulhdu), followed by mulli by 37 and a subtract,
; for both the shift amount and its complement (37 - %z).
declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-LABEL: fshl_i37:
; CHECK: # %bb.0:
; CHECK-NEXT: lis 6, -8857
; CHECK-NEXT: subfic 7, 5, 37
; CHECK-NEXT: clrldi 5, 5, 27
; CHECK-NEXT: clrldi 4, 4, 27
; CHECK-NEXT: ori 6, 6, 51366
; CHECK-NEXT: clrldi 7, 7, 27
; CHECK-NEXT: sldi 6, 6, 32
; CHECK-NEXT: oris 6, 6, 3542
; CHECK-NEXT: ori 6, 6, 31883
; CHECK-NEXT: mulhdu 8, 7, 6
; CHECK-NEXT: mulhdu 6, 5, 6
; CHECK-NEXT: rldicl 8, 8, 59, 5
; CHECK-NEXT: rldicl 6, 6, 59, 5
; CHECK-NEXT: mulli 8, 8, 37
; CHECK-NEXT: mulli 6, 6, 37
; CHECK-NEXT: sub 7, 7, 8
; CHECK-NEXT: subf. 5, 6, 5
; CHECK-NEXT: srd 4, 4, 7
; CHECK-NEXT: sld 5, 3, 5
; CHECK-NEXT: or 4, 5, 4
; CHECK-NEXT: isel 3, 3, 4, 2
; CHECK-NEXT: blr
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
; extract(concat(0b1110000, 0b1111111) << 2) = 0b1000011
; All three operands are constants (112 = 0b1110000, 127 = 0b1111111, shift 2),
; so the call is expected to fold to a single load-immediate of
; 67 (= 0b1000011) with no shift instructions emitted.
declare i7 @llvm.fshl.i7(i7, i7, i7)
define i7 @fshl_i7_const_fold() {
; CHECK-LABEL: fshl_i7_const_fold:
; CHECK: # %bb.0:
; CHECK-NEXT: li 3, 67
; CHECK-NEXT: blr
%f = call i7 @llvm.fshl.i7(i7 112, i7 127, i7 2)
ret i7 %f
}
; With constant shift amount, this is rotate + insert (missing extended mnemonics).
; Funnel shift left by the constant 9: the expected code rotates the second
; operand left by 9 (rlwinm with a full 0..31 mask) and then inserts the first
; operand, rotated by 9, into mask bits 0..22 (rlwimi), before moving the
; result into the return register.
define i32 @fshl_i32_const_shift(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_const_shift:
; CHECK: # %bb.0:
; CHECK-NEXT: rlwinm 4, 4, 9, 0, 31
; CHECK-NEXT: rlwimi 4, 3, 9, 0, 22
; CHECK-NEXT: mr 3, 4
; CHECK-NEXT: blr
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
ret i32 %f
}
; Check modulo math on shift amount. 41-32=9.
; The constant shift amount 41 exceeds the 32-bit width; the expected output
; is identical to the shift-by-9 case in fshl_i32_const_shift.
define i32 @fshl_i32_const_overshift(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_const_overshift:
; CHECK: # %bb.0:
; CHECK-NEXT: rlwinm 4, 4, 9, 0, 31
; CHECK-NEXT: rlwimi 4, 3, 9, 0, 22
; CHECK-NEXT: mr 3, 4
; CHECK-NEXT: blr
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41)
ret i32 %f
}
; 64-bit should also work. 105-64 = 41.
; The constant shift amount 105 exceeds the 64-bit width; the expected code
; uses the 64-bit rotate (rotldi) and insert (rldimi) forms with an amount
; of 41.
define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) {
; CHECK-LABEL: fshl_i64_const_overshift:
; CHECK: # %bb.0:
; CHECK-NEXT: rotldi 4, 4, 41
; CHECK-NEXT: rldimi 4, 3, 41, 0
; CHECK-NEXT: mr 3, 4
; CHECK-NEXT: blr
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105)
ret i64 %f
}
; This should work without any node-specific logic.
; All operands are constants: fshl(255, 0, 7) takes the top 8 bits of
; (0xFF00 << 7), i.e. 0x80, so the expected output is a single
; load-immediate of 128.
define i8 @fshl_i8_const_fold() {
; CHECK-LABEL: fshl_i8_const_fold:
; CHECK: # %bb.0:
; CHECK-NEXT: li 3, 128
; CHECK-NEXT: blr
%f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7)
ret i8 %f
}
; Repeat everything for funnel shift right.
; General case - all operands can be variables.
; Calls llvm.fshr.i32 with three variable operands (%x, %y, %z).
; Mirror image of fshl_i32: the second operand is shifted right by the masked
; amount, the first operand is shifted left by the masked complement
; (32 - %z), and the final isel is keyed on the condition set by andi. -
; presumably so that a masked amount of zero yields the second operand
; unchanged.
define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshr_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: subfic 6, 5, 32
; CHECK-NEXT: andi. 5, 5, 31
; CHECK-NEXT: clrlwi 6, 6, 27
; CHECK-NEXT: srw 5, 4, 5
; CHECK-NEXT: slw 3, 3, 6
; CHECK-NEXT: or 3, 3, 5
; CHECK-NEXT: isel 3, 4, 3, 2
; CHECK-NEXT: blr
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
}
; Verify that weird types are minimally supported.
; Calls llvm.fshr.i37 with three variable operands of the non-power-of-two
; width i37. As in the fshl variant, the expected code appears to reduce both
; the shift amount and its complement (37 - %z) modulo 37 using a
; multiply-high by a materialized constant followed by mulli by 37 and a
; subtract, since a simple bit mask cannot be used for this width.
declare i37 @llvm.fshr.i37(i37, i37, i37)
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; CHECK-LABEL: fshr_i37:
; CHECK: # %bb.0:
; CHECK-NEXT: lis 6, -8857
; CHECK-NEXT: subfic 7, 5, 37
; CHECK-NEXT: clrldi 5, 5, 27
; CHECK-NEXT: clrldi 9, 4, 27
; CHECK-NEXT: ori 6, 6, 51366
; CHECK-NEXT: clrldi 7, 7, 27
; CHECK-NEXT: sldi 6, 6, 32
; CHECK-NEXT: oris 6, 6, 3542
; CHECK-NEXT: ori 6, 6, 31883
; CHECK-NEXT: mulhdu 8, 5, 6
; CHECK-NEXT: mulhdu 6, 7, 6
; CHECK-NEXT: rldicl 8, 8, 59, 5
; CHECK-NEXT: rldicl 6, 6, 59, 5
; CHECK-NEXT: mulli 8, 8, 37
; CHECK-NEXT: mulli 6, 6, 37
; CHECK-NEXT: subf. 5, 8, 5
; CHECK-NEXT: sub 6, 7, 6
; CHECK-NEXT: srd 5, 9, 5
; CHECK-NEXT: sld 3, 3, 6
; CHECK-NEXT: or 3, 3, 5
; CHECK-NEXT: isel 3, 4, 3, 2
; CHECK-NEXT: blr
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
; extract(concat(0b1110000, 0b1111111) >> 2) = 0b0011111
; All three operands are constants (112 = 0b1110000, 127 = 0b1111111, shift 2),
; so the call is expected to fold to a single load-immediate of
; 31 (= 0b0011111) with no shift instructions emitted.
declare i7 @llvm.fshr.i7(i7, i7, i7)
define i7 @fshr_i7_const_fold() {
; CHECK-LABEL: fshr_i7_const_fold:
; CHECK: # %bb.0:
; CHECK-NEXT: li 3, 31
; CHECK-NEXT: blr
%f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2)
ret i7 %f
}
; With constant shift amount, this is rotate + insert (missing extended mnemonics).
; Funnel shift right by the constant 9: the expected code expresses this as a
; left rotate by 23 (= 32 - 9) of the second operand (rlwinm with a full 0..31
; mask), then inserts the first operand, rotated by 23, into mask bits 0..8
; (rlwimi), before moving the result into the return register.
define i32 @fshr_i32_const_shift(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_const_shift:
; CHECK: # %bb.0:
; CHECK-NEXT: rlwinm 4, 4, 23, 0, 31
; CHECK-NEXT: rlwimi 4, 3, 23, 0, 8
; CHECK-NEXT: mr 3, 4
; CHECK-NEXT: blr
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
ret i32 %f
}
; Check modulo math on shift amount. 41-32=9.
; The constant shift amount 41 exceeds the 32-bit width; the expected output
; is identical to the shift-by-9 case in fshr_i32_const_shift.
define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_const_overshift:
; CHECK: # %bb.0:
; CHECK-NEXT: rlwinm 4, 4, 23, 0, 31
; CHECK-NEXT: rlwimi 4, 3, 23, 0, 8
; CHECK-NEXT: mr 3, 4
; CHECK-NEXT: blr
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41)
ret i32 %f
}
; 64-bit should also work. 105-64 = 41.
; The constant shift amount 105 exceeds the 64-bit width. The expected code
; uses the 64-bit rotate (rotldi) and insert (rldimi) forms with a left-rotate
; amount of 23 (= 64 - 41).
define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) {
; CHECK-LABEL: fshr_i64_const_overshift:
; CHECK: # %bb.0:
; CHECK-NEXT: rotldi 4, 4, 23
; CHECK-NEXT: rldimi 4, 3, 23, 0
; CHECK-NEXT: mr 3, 4
; CHECK-NEXT: blr
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
ret i64 %f
}
; This should work without any node-specific logic.
; All operands are constants: fshr(255, 0, 7) takes the low 8 bits of
; (0xFF00 >> 7), i.e. 0xFE, so the expected output is a single
; load-immediate of 254.
define i8 @fshr_i8_const_fold() {
; CHECK-LABEL: fshr_i8_const_fold:
; CHECK: # %bb.0:
; CHECK-NEXT: li 3, 254
; CHECK-NEXT: blr
%f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7)
ret i8 %f
}
; Shift amount equal to the bit width (32) for fshl on i32.
; The expected output is a bare return with no other instructions, i.e. the
; value already in the return register (presumably %x) is returned unchanged.
define i32 @fshl_i32_shift_by_bitwidth(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_shift_by_bitwidth:
; CHECK: # %bb.0:
; CHECK-NEXT: blr
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 32)
ret i32 %f
}
; Shift amount equal to the bit width (32) for fshr on i32.
; The expected output is a single register move (r4 into r3) followed by the
; return, i.e. presumably %y is returned unchanged.
define i32 @fshr_i32_shift_by_bitwidth(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_shift_by_bitwidth:
; CHECK: # %bb.0:
; CHECK-NEXT: mr 3, 4
; CHECK-NEXT: blr
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 32)
ret i32 %f
}
; Vector version: every lane of the <4 x i32> shift amount equals the element
; bit width (32) for fshl.
; The expected output is a bare return with no other instructions, i.e. the
; value already in the vector return register (presumably %x) is returned
; unchanged.
define <4 x i32> @fshl_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: fshl_v4i32_shift_by_bitwidth:
; CHECK: # %bb.0:
; CHECK-NEXT: blr
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
ret <4 x i32> %f
}
; Vector version: every lane of the <4 x i32> shift amount equals the element
; bit width (32) for fshr.
; The expected output is a single vector register move (v3 into v2) followed
; by the return, i.e. presumably %y is returned unchanged.
define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: fshr_v4i32_shift_by_bitwidth:
; CHECK: # %bb.0:
; CHECK-NEXT: vmr 2, 3
; CHECK-NEXT: blr
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
ret <4 x i32> %f
}