We would like to optimize situations of the form that happen after loop
vectorization+SROA:
```
loop:
%phi = phi zeroinitializer, %interleaved
%deinterleave_a = shufflevector %phi, poison ; pick half of the lanes
%deinterleave_b = shufflevector %phi, poison ; pick remaining lanes
... %a = ... %b = ...
%interleaved = shufflevector %a, %b ; interleave lanes of a+b
```
where the interleave and de-interleave shuffle operations cancel each
other out.
In principle `foldOpPhi` could handle this, but it currently bails out
whenever the `Phi` operation has more than one use.
This change extends `foldOpPhi` to allow multiple `shufflevector` uses of
the `Phi`, provided every one of them is shown to simplify for all `Phi`
input values.
116 lines
5.1 KiB
LLVM
116 lines
5.1 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -S -passes=instcombine | FileCheck %s

; A two-predecessor phi whose only users are two de-interleaving
; shufflevectors. Both shuffles simplify for every incoming value:
;   * for the constant from %entry they fold to constant vectors, and
;   * for %interleave from %then they fold straight back to %load0 / %load1.
; The expected output therefore contains no shufflevector at all; the sub is
; performed in %then and the phi carries the already-subtracted <4 x i16>.
define <4 x i16> @f0(i1 %c, ptr %p0, ptr %p1) {
; CHECK-LABEL: define <4 x i16> @f0(
; CHECK-SAME: i1 [[C:%.*]], ptr [[P0:%.*]], ptr [[P1:%.*]]) {
; CHECK-NEXT:  [[ENTRY:.*]]:
; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[MERGE:.*]]
; CHECK:       [[THEN]]:
; CHECK-NEXT:    [[LOAD0:%.*]] = load <4 x i16>, ptr [[P0]], align 16
; CHECK-NEXT:    [[LOAD1:%.*]] = load <4 x i16>, ptr [[P1]], align 16
; CHECK-NEXT:    [[TMP0:%.*]] = sub <4 x i16> [[LOAD0]], [[LOAD1]]
; CHECK-NEXT:    br label %[[MERGE]]
; CHECK:       [[MERGE]]:
; CHECK-NEXT:    [[SUB:%.*]] = phi <4 x i16> [ <i16 -87, i16 327, i16 51, i16 755>, %[[ENTRY]] ], [ [[TMP0]], %[[THEN]] ]
; CHECK-NEXT:    ret <4 x i16> [[SUB]]
;
entry:
  br i1 %c, label %then, label %merge

then:
  %load0 = load <4 x i16>, ptr %p0, align 16
  %load1 = load <4 x i16>, ptr %p1, align 16
  ; Even result lanes take %load0 in order; odd result lanes take %load1 in
  ; reverse order (mask indices 4..7 address the second operand).
  %interleave = shufflevector <4 x i16> %load0, <4 x i16> %load1, <8 x i32> <i32 0, i32 7, i32 1, i32 6, i32 2, i32 5, i32 3, i32 4>
  br label %merge

merge:
  %phi = phi <8 x i16> [<i16 1, i16 22, i16 333, i16 4, i16 55, i16 6, i16 777, i16 88>, %entry], [%interleave, %then]
  ; Even lanes of %phi: recovers %load0 on the %then edge.
  %shuf0 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; Odd lanes of %phi, read back to front: undoes the reversal above and so
  ; recovers %load1 on the %then edge.
  %shuf1 = shufflevector <8 x i16> %phi, <8 x i16> poison, <4 x i32> <i32 7, i32 5, i32 3, i32 1>
  %sub = sub <4 x i16> %shuf0, %shuf1
  ret <4 x i16> %sub
}

; Loop form of the pattern: a loop-carried <8 x float> phi is de-interleaved
; into even/odd halves at the top of the loop, each half is accumulated
; separately, and the halves are re-interleaved to feed the phi's back edge.
; The de-interleave and interleave shuffles cancel, so the expected output
; carries two independent <4 x float> phis through the loop and performs the
; single remaining interleave in %exit, just before the store.
define void @deinterleave_interleave(ptr %p_begin, ptr %p_end, ptr %out) {
; CHECK-LABEL: define void @deinterleave_interleave(
; CHECK-SAME: ptr [[P_BEGIN:%.*]], ptr [[P_END:%.*]], ptr [[OUT:%.*]]) {
; CHECK-NEXT:  [[ENTRY:.*]]:
; CHECK-NEXT:    br label %[[LOOP:.*]]
; CHECK:       [[LOOP]]:
; CHECK-NEXT:    [[ACC:%.*]] = phi <4 x float> [ zeroinitializer, %[[ENTRY]] ], [ [[SUM_LOWS:%.*]], %[[LOOP]] ]
; CHECK-NEXT:    [[ODDS:%.*]] = phi <4 x float> [ zeroinitializer, %[[ENTRY]] ], [ [[SUM_HIGHS:%.*]], %[[LOOP]] ]
; CHECK-NEXT:    [[P:%.*]] = phi ptr [ [[P_BEGIN]], %[[ENTRY]] ], [ [[P_INC:%.*]], %[[LOOP]] ]
; CHECK-NEXT:    [[VAL:%.*]] = load <4 x i8>, ptr [[P]], align 4
; CHECK-NEXT:    [[HIGHS:%.*]] = ashr <4 x i8> [[VAL]], splat (i8 4)
; CHECK-NEXT:    [[LOWS:%.*]] = and <4 x i8> [[VAL]], splat (i8 15)
; CHECK-NEXT:    [[HIGHS_F:%.*]] = sitofp <4 x i8> [[HIGHS]] to <4 x float>
; CHECK-NEXT:    [[LOWS_F:%.*]] = uitofp nneg <4 x i8> [[LOWS]] to <4 x float>
; CHECK-NEXT:    [[SUM_LOWS]] = fadd <4 x float> [[ACC]], [[LOWS_F]]
; CHECK-NEXT:    [[SUM_HIGHS]] = fadd <4 x float> [[ODDS]], [[HIGHS_F]]
; CHECK-NEXT:    [[P_INC]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4
; CHECK-NEXT:    [[C:%.*]] = icmp eq ptr [[P_INC]], [[P_END]]
; CHECK-NEXT:    br i1 [[C]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK:       [[EXIT]]:
; CHECK-NEXT:    [[INTERLEAVE:%.*]] = shufflevector <4 x float> [[SUM_LOWS]], <4 x float> [[SUM_HIGHS]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT:    store <8 x float> [[INTERLEAVE]], ptr [[OUT]], align 4
; CHECK-NEXT:    ret void
;
entry:
  br label %loop

loop:
  %acc = phi <8 x float> [ zeroinitializer, %entry ], [ %interleave, %loop ]
  %p = phi ptr [%p_begin, %entry ], [%p_inc, %loop]

  ; De-interleave the accumulator: two shufflevector users of the same phi.
  %evens = shufflevector <8 x float> %acc, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %odds = shufflevector <8 x float> %acc, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

  ; Split each loaded byte into its high part (arithmetic shift by 4) and its
  ; low 4 bits.
  %val = load <4 x i8>, ptr %p, align 4
  %highs = ashr <4 x i8> %val, <i8 4, i8 4, i8 4, i8 4>
  %lows = and <4 x i8> %val, <i8 15, i8 15, i8 15, i8 15>

  %highs_f = sitofp <4 x i8> %highs to <4 x float>
  %lows_f = sitofp <4 x i8> %lows to <4 x float>

  ; Low parts accumulate into the even lanes, high parts into the odd lanes.
  %sum_lows = fadd <4 x float> %evens, %lows_f
  %sum_highs = fadd <4 x float> %odds, %highs_f

  ; Re-interleave: exact inverse of the %evens / %odds masks above.
  %interleave = shufflevector <4 x float> %sum_lows, <4 x float> %sum_highs, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>

  %p_inc = getelementptr inbounds i8, ptr %p, i32 4
  %c = icmp eq ptr %p_inc, %p_end
  br i1 %c, label %exit, label %loop

exit:
  store <8 x float> %interleave, ptr %out, align 4
  ret void
}

; A phi with mixed users: one shufflevector and one plain add. Both incoming
; values are constants, so every user folds on every edge and the expected
; output is a single phi of pre-computed constant vectors.
; Mask indices 4 and 6 of %shuf0 address the poison second operand, so the
; upper two result lanes carry no defined value (the expected output shows
; them as undef).
define <4 x i16> @f1(i1 %c, ptr %p) {
; CHECK-LABEL: define <4 x i16> @f1(
; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) {
; CHECK-NEXT:  [[ENTRY:.*]]:
; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[MERGE:.*]]
; CHECK:       [[THEN]]:
; CHECK-NEXT:    store i32 42, ptr [[P]], align 4
; CHECK-NEXT:    br label %[[MERGE]]
; CHECK:       [[MERGE]]:
; CHECK-NEXT:    [[XOR:%.*]] = phi <4 x i16> [ <i16 3, i16 346, i16 undef, i16 undef>, %[[ENTRY]] ], [ <i16 7, i16 74, i16 undef, i16 undef>, %[[THEN]] ]
; CHECK-NEXT:    ret <4 x i16> [[XOR]]
;
entry:
  br i1 %c, label %then, label %merge

then:
  ; NOTE(review): presumably present so %then has a side effect and is not
  ; folded away; the expected output keeps this store unchanged.
  store i32 42, ptr %p, align 4
  br label %merge

merge:
  %phi = phi <4 x i16> [<i16 1, i16 22, i16 333, i16 4>, %entry], [<i16 555, i16 6, i16 77, i16 8>, %then]
  ; Lanes 0 and 2 of %phi, followed by two lanes taken from poison.
  %shuf0 = shufflevector <4 x i16> %phi, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ; Second, non-shuffle user of the same phi.
  %add1 = add <4 x i16> %phi, <i16 1, i16 1, i16 1, i16 1>
  %xor = xor <4 x i16> %shuf0, %add1
  ret <4 x i16> %xor
}