Files
clang-p2996/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll
Nikita Popov a105877646 [InstCombine] Remove some of the complexity-based canonicalization (#91185)
The idea behind this canonicalization is that it allows us to handle fewer
patterns, because we know that some will be canonicalized away. This is
indeed very useful, e.g. to know that constants are always on the right.

However, this is only useful if the canonicalization is actually
reliable. This is the case for constants, but not for arguments: Moving
these to the right makes it look like the "more complex" expression is
guaranteed to be on the left, but this is not actually the case in
practice. It fails as soon as you replace the argument with another
instruction.

The end result is that it looks like things work correctly in tests,
while they actually don't. We use the "thwart complexity-based
canonicalization" trick to handle this in tests, but it's often a
challenge for new contributors to get this right, and based on the
regressions this PR originally exposed, we clearly don't get this right
in many cases.

For this reason, I think that it's better to remove this complexity
canonicalization. It will make it much easier to write tests for
commuted cases and make sure that they are handled.
2024-08-21 12:02:54 +02:00

243 lines
12 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; The lit directive below runs opt with the complete default<O3> pass pipeline,
; prints textual IR (-S) and pipes it into FileCheck. No triple is passed on
; the command line, so the `target triple` line is the module's only target
; specification.
; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
target triple = "arm64-apple-darwin"
; Make sure we can vectorize a loop that uses a function to clamp a double to
; be between a given minimum and maximum value.
; Helper with internal linkage, called once per iteration from @loop.
; Returns 0.0 when %v < 0.0, 6.0 when %v > 6.0, and %v unchanged otherwise.
; Both comparisons use ordered predicates (olt/ogt), which are false for NaN,
; so a NaN input takes the fall-through path and is returned as-is.
; The body is intentionally unoptimized: the argument and the return value are
; routed through stack slots and every path merges in %return.
define internal double @clamp(double %v) {
entry:
; %retval holds the result until the common exit; %v.addr holds the argument.
%retval = alloca double, align 8
%v.addr = alloca double, align 8
store double %v, ptr %v.addr, align 8
%0 = load double, ptr %v.addr, align 8
; Lower bound: v < 0.0 ?
%cmp = fcmp olt double %0, 0.000000e+00
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
; Below the range: result is the lower bound 0.0.
store double 0.000000e+00, ptr %retval, align 8
br label %return
if.end: ; preds = %entry
%1 = load double, ptr %v.addr, align 8
; Upper bound: v > 6.0 ?
%cmp1 = fcmp ogt double %1, 6.000000e+00
br i1 %cmp1, label %if.then2, label %if.end3
if.then2: ; preds = %if.end
; Above the range: result is the upper bound 6.0.
store double 6.000000e+00, ptr %retval, align 8
br label %return
if.end3: ; preds = %if.end
; Neither comparison was true: pass the input through unchanged.
%2 = load double, ptr %v.addr, align 8
store double %2, ptr %retval, align 8
br label %return
return: ; preds = %if.end3, %if.then2, %if.then
; Single exit: reload whichever value was stored to %retval.
%3 = load double, ptr %retval, align 8
ret double %3
}
; Loop under test: for (i = 0; i < 20000; ++i) X[i] = clamp(Y[i]).
; The input is unoptimized: %X, %Y and the i32 counter %i all live in stack
; slots, and the clamp is a real call.
; The autogenerated assertions below expect the O3 output to contain:
;   - no call to @clamp: each clamp is two fcmp + two select instructions;
;   - a vector loop on <2 x double>, two vectors per iteration (index step 4);
;   - an entry check on the distance between X and Y (unsigned compare against
;     32) that branches to the scalar loop instead of the vector loop.
define void @loop(ptr %X, ptr %Y) {
; CHECK-LABEL: @loop(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[X6:%.*]] = ptrtoint ptr [[X:%.*]] to i64
; CHECK-NEXT: [[Y7:%.*]] = ptrtoint ptr [[Y:%.*]] to i64
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[X6]], [[Y7]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD8]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD]], <double 6.000000e+00, double 6.000000e+00>
; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <2 x double> [[WIDE_LOAD8]], <double 6.000000e+00, double 6.000000e+00>
; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP5]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP6]], <2 x double> <double 6.000000e+00, double 6.000000e+00>, <2 x double> [[WIDE_LOAD8]]
; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x double> zeroinitializer, <2 x double> [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 16
; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP11]], align 8
; CHECK-NEXT: store <2 x double> [[TMP10]], ptr [[TMP12]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20000
; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt double [[TMP14]], 0.000000e+00
; CHECK-NEXT: [[CMP1_I:%.*]] = fcmp ogt double [[TMP14]], 6.000000e+00
; CHECK-NEXT: [[DOTV_I:%.*]] = select i1 [[CMP1_I]], double 6.000000e+00, double [[TMP14]]
; CHECK-NEXT: [[RETVAL_0_I:%.*]] = select i1 [[CMP_I]], double 0.000000e+00, double [[DOTV_I]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store double [[RETVAL_0_I]], ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20000
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
;
entry:
; Spill both pointer arguments and create the i32 counter slot %i, whose
; live range is bracketed by the lifetime.start/lifetime.end calls.
%X.addr = alloca ptr, align 8
%Y.addr = alloca ptr, align 8
%i = alloca i32, align 4
store ptr %X, ptr %X.addr, align 8
store ptr %Y, ptr %Y.addr, align 8
call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2
; i = 0
store i32 0, ptr %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
; Loop header: keep iterating while i < 20000 (unsigned compare).
%0 = load i32, ptr %i, align 4
%cmp = icmp ult i32 %0, 20000
br i1 %cmp, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond
call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2
br label %for.end
for.body: ; preds = %for.cond
; Read Y[i]; the i32 counter is zero-extended to i64 for indexing.
%1 = load ptr, ptr %Y.addr, align 8
%2 = load i32, ptr %i, align 4
%idxprom = zext i32 %2 to i64
%arrayidx = getelementptr inbounds double, ptr %1, i64 %idxprom
%3 = load double, ptr %arrayidx, align 8
; Clamp the loaded value and write it to X[i].
%call = call double @clamp(double %3)
%4 = load ptr, ptr %X.addr, align 8
%5 = load i32, ptr %i, align 4
%idxprom1 = zext i32 %5 to i64
%arrayidx2 = getelementptr inbounds double, ptr %4, i64 %idxprom1
store double %call, ptr %arrayidx2, align 8
br label %for.inc
for.inc: ; preds = %for.body
; i = i + 1, done through the stack slot.
%6 = load i32, ptr %i, align 4
%inc = add i32 %6, 1
store i32 %inc, ptr %i, align 4
br label %for.cond
for.end: ; preds = %for.cond.cleanup
ret void
}
; Test that requires sinking/hoisting of instructions for vectorization.
; Loop under test, for iv in [0, 10000):
;   if (C[iv] == 20) B[iv] = A[iv] * x;
;   else             B[iv] = A[iv] * x + B[iv];
; Both arms repeat the same A[iv] load and fmul by %x, and both end with a
; store to B[iv].
; The autogenerated assertions below expect the O3 output to contain:
;   - entry-block overlap checks for B against C and for B against A, which
;     branch to the scalar loop when a conflict is found;
;   - a vector loop on <4 x float> / <4 x i32>, two vectors per iteration
;     (index step 8), with the branch replaced by a select;
;   - a scalar loop with one load of A and one fmul ahead of the branch, and a
;     single store to B in the latch fed by a phi.
define void @loop2(ptr %A, ptr %B, ptr %C, float %x) {
; CHECK-LABEL: @loop2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 40000
; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 40000
; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 40000
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[B]], [[SCEVGEP3]]
; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[LOOP_BODY:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4, !alias.scope [[META4:![0-9]+]]
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !alias.scope [[META4]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], <i32 20, i32 20, i32 20, i32 20>
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD7]], <i32 20, i32 20, i32 20, i32 20>
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 16
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP4]], align 4, !alias.scope [[META7:![0-9]+]]
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP5]], align 4, !alias.scope [[META7]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD8]]
; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD9]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16
; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META11:![0-9]+]]
; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope [[META9]], !noalias [[META11]]
; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[TMP6]], [[WIDE_LOAD10]]
; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[WIDE_LOAD11]]
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x float> [[TMP6]], <4 x float> [[TMP10]]
; CHECK-NEXT: [[PREDPHI12:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP11]]
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 16
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4, !alias.scope [[META9]], !noalias [[META11]]
; CHECK-NEXT: store <4 x float> [[PREDPHI12]], ptr [[TMP12]], align 4, !alias.scope [[META9]], !noalias [[META11]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
; CHECK-NEXT: br i1 [[TMP13]], label [[EXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: loop.body:
; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[C_GEP:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV1]]
; CHECK-NEXT: [[C_LV:%.*]] = load i32, ptr [[C_GEP]], align 4
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[C_LV]], 20
; CHECK-NEXT: [[A_GEP_0:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV1]]
; CHECK-NEXT: [[A_LV_0:%.*]] = load float, ptr [[A_GEP_0]], align 4
; CHECK-NEXT: [[MUL2_I81_I:%.*]] = fmul float [[X]], [[A_LV_0]]
; CHECK-NEXT: [[B_GEP_0:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV1]]
; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[ELSE:%.*]]
; CHECK: else:
; CHECK-NEXT: [[B_LV:%.*]] = load float, ptr [[B_GEP_0]], align 4
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[MUL2_I81_I]], [[B_LV]]
; CHECK-NEXT: br label [[LOOP_LATCH]]
; CHECK: loop.latch:
; CHECK-NEXT: [[ADD_SINK:%.*]] = phi float [ [[ADD]], [[ELSE]] ], [ [[MUL2_I81_I]], [[LOOP_BODY]] ]
; CHECK-NEXT: store float [[ADD_SINK]], ptr [[B_GEP_0]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 10000
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
entry:
br label %loop.header
loop.header:
; iv counts 0, 1, 2, ...; the loop runs while iv < 10000 (unsigned compare).
%iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %entry ]
%cmp.0 = icmp ult i64 %iv, 10000
br i1 %cmp.0, label %loop.body, label %exit
loop.body:
; Select the arm: C[iv] == 20 goes to %then, anything else to %else.
%C.gep = getelementptr inbounds i32, ptr %C, i64 %iv
%C.lv = load i32, ptr %C.gep
%cmp = icmp eq i32 %C.lv, 20
br i1 %cmp, label %then, label %else
then:
; B[iv] = A[iv] * x
%A.gep.0 = getelementptr inbounds float, ptr %A, i64 %iv
%A.lv.0 = load float, ptr %A.gep.0, align 4
%mul2.i81.i = fmul float %A.lv.0, %x
%B.gep.0 = getelementptr inbounds float, ptr %B, i64 %iv
store float %mul2.i81.i, ptr %B.gep.0, align 4
br label %loop.latch
else:
; B[iv] = A[iv] * x + B[iv]; the A load and fmul repeat those in %then.
%A.gep.1 = getelementptr inbounds float, ptr %A, i64 %iv
%A.lv.1 = load float, ptr %A.gep.1, align 4
%mul2 = fmul float %A.lv.1, %x
%B.gep.1 = getelementptr inbounds float, ptr %B, i64 %iv
%B.lv = load float, ptr %B.gep.1, align 4
%add = fadd float %mul2, %B.lv
store float %add, ptr %B.gep.1, align 4
br label %loop.latch
loop.latch:
; Advance the induction variable and return to the header test.
%iv.next = add nuw nsw i64 %iv, 1
br label %loop.header
exit:
ret void
}
; Lifetime marker intrinsics; @loop calls them with size 4 on its i32 counter
; slot %i, once before the loop and once in %for.cond.cleanup.
declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)