The semantics of tail-predicated loops mean that the value of LR at the time an instruction is executed determines the predicate. In other words:

    mov r3, #3
    DLSTP lr, r3           // Start tail predication, lr==3
    VADD.s32 q0, q1, q2    // Lanes 0, 1 and 2 are updated in q0.
    mov lr, #1
    VADD.s32 q0, q1, q2    // Only the first lane is updated.

This means that the value of lr cannot be spilled and re-used in tail predication regions without potentially altering the behaviour of the program. More lanes than required could be stored, for example, and in the case of a gather those lanes might not have been set up, leading to alignment exceptions.

This patch adds a new lr predicate operand to MVE instructions in order to keep a reference to the lr that they use as a tail predicate. It will usually hold the zeroreg, meaning not predicated, and is set to the LR phi value in the MVETPAndVPTOptimisationsPass. This will prevent it from being spilled anywhere that it needs to be used.

A lot of tests needed updating.

Differential Revision: https://reviews.llvm.org/D107638
185 lines
8.6 KiB
YAML
185 lines
8.6 KiB
YAML
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
|
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s
|
|
|
|
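# Test a loop whose live-out Q reg ($q0) is read after the loop through an
|
|
# aliasing scalar register ($s3). The expected output below still forms a tail-predicated loop (MVE_DLSTP_32 / MVE_LETP), so the scalar read does not prevent tail predication.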
|
|
|
|
--- |
|
|
; IR-level reference for the machine function of the same name.
; Loads four i16 elements at a time from %a and %b under a vctp32 mask,
; sign-extends them to i32, multiplies them and accumulates the products in a
; <4 x i32> vector. Returns lane 3 of the final accumulator, or 0 when %N is 0.
; The value names here are referenced by the MIR memory operands
; (%ir.lsr.iv17, %ir.lsr.iv1820), so they must not be renamed.
define dso_local i32 @no_vpsel_liveout(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
|
|
entry:
|
|
; %cmp9 guards the N == 0 early exit taken by the branch at the end of this block.
%cmp9 = icmp eq i32 %N, 0
|
|
; %tmp .. %tmp5 compute the loop iteration count:
; ((((N + 3) >> 2) << 2) - 4) >> 2, plus 1.
%tmp = add i32 %N, 3
|
|
%tmp1 = lshr i32 %tmp, 2
|
|
%tmp2 = shl nuw i32 %tmp1, 2
|
|
%tmp3 = add i32 %tmp2, -4
|
|
%tmp4 = lshr i32 %tmp3, 2
|
|
%tmp5 = add nuw nsw i32 %tmp4, 1
|
|
br i1 %cmp9, label %for.cond.cleanup, label %vector.ph
|
|
|
|
vector.ph: ; preds = %entry
|
|
; Hands the iteration count to the loop-start intrinsic; %start seeds the loop counter phi.
%start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp5)
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
; %lsr.iv1 is the loop counter, %lsr.iv18 / %lsr.iv are the running %b / %a pointers.
%lsr.iv1 = phi i32 [ %lsr.iv.next, %vector.body ], [ %start, %vector.ph ]
|
|
%lsr.iv18 = phi i16* [ %scevgep19, %vector.body ], [ %b, %vector.ph ]
|
|
%lsr.iv = phi i16* [ %scevgep, %vector.body ], [ %a, %vector.ph ]
|
|
; Vector accumulator, zero on entry to the loop.
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %tmp13, %vector.body ]
|
|
; Element count fed to vctp32: starts at %N and drops by 4 per iteration (%tmp9).
%tmp7 = phi i32 [ %N, %vector.ph ], [ %tmp9, %vector.body ]
|
|
%lsr.iv17 = bitcast i16* %lsr.iv to <4 x i16>*
|
|
%lsr.iv1820 = bitcast i16* %lsr.iv18 to <4 x i16>*
|
|
; %tmp8 is the lane mask used by both masked loads below.
%tmp8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp7)
|
|
%tmp9 = sub i32 %tmp7, 4
|
|
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv17, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
|
|
%tmp10 = sext <4 x i16> %wide.masked.load to <4 x i32>
|
|
%wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv1820, i32 2, <4 x i1> %tmp8, <4 x i16> undef)
|
|
%tmp11 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
|
|
; The multiply and the accumulate are not masked and there is no select on the
; mask; %tmp13 is the value that is live out of the loop.
%tmp12 = mul nsw <4 x i32> %tmp11, %tmp10
|
|
%tmp13 = add <4 x i32> %tmp12, %vec.phi
|
|
; Advance both pointers by 4 i16 elements.
%scevgep = getelementptr i16, i16* %lsr.iv, i32 4
|
|
%scevgep19 = getelementptr i16, i16* %lsr.iv18, i32 4
|
|
; Decrement the loop counter by 1 and loop while the result is non-zero.
%tmp14 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv1, i32 1)
|
|
%tmp15 = icmp ne i32 %tmp14, 0
|
|
%lsr.iv.next = add nsw i32 %lsr.iv1, -1
|
|
br i1 %tmp15, label %vector.body, label %middle.block
|
|
|
|
middle.block: ; preds = %vector.body
|
|
; Only lane 3 of the accumulator is used after the loop.
%tmp16 = extractelement <4 x i32> %tmp13, i32 3
|
|
br label %for.cond.cleanup
|
|
|
|
for.cond.cleanup: ; preds = %middle.block, %entry
|
|
; Result is 0 on the N == 0 path, otherwise the extracted lane.
%res.0.lcssa = phi i32 [ 0, %entry ], [ %tmp16, %middle.block ]
|
|
ret i32 %res.0.lcssa
|
|
}
|
|
; Intrinsics called by @no_vpsel_liveout: the masked <4 x i16> load, the
; loop-start / loop-decrement pair that drive the loop counter, and the MVE
; vctp32 intrinsic that produces the <4 x i1> lane mask.
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
|
|
declare i32 @llvm.start.loop.iterations.i32(i32)
|
|
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
|
|
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
|
|
|
|
...
|
|
---
|
|
# Machine function corresponding to the IR function @no_vpsel_liveout above.
# It uses physical registers only (registers: [] is empty) and tracks liveness.
name: no_vpsel_liveout
|
|
alignment: 2
|
|
exposesReturnsTwice: false
|
|
legalized: false
|
|
regBankSelected: false
|
|
selected: false
|
|
failedISel: false
|
|
tracksRegLiveness: true
|
|
hasWinCFI: false
|
|
registers: []
|
|
# Function-level live-in registers; three are listed, matching the three IR
# arguments (presumably %a, %b and %N in that order; confirm against the ABI).
liveins:
|
|
- { reg: '$r0', virtual-reg: '' }
|
|
- { reg: '$r1', virtual-reg: '' }
|
|
- { reg: '$r2', virtual-reg: '' }
|
|
frameInfo:
|
|
isFrameAddressTaken: false
|
|
isReturnAddressTaken: false
|
|
hasStackMap: false
|
|
hasPatchPoint: false
|
|
# 8 bytes: the two 4-byte spill slots listed under "stack" below.
stackSize: 8
|
|
offsetAdjustment: 0
|
|
maxAlignment: 4
|
|
adjustsStack: false
|
|
hasCalls: false
|
|
stackProtector: ''
|
|
maxCallFrameSize: 0
|
|
cvBytesOfCalleeSavedRegisters: 0
|
|
hasOpaqueSPAdjustment: false
|
|
hasVAStart: false
|
|
hasMustTailInVarArgFunc: false
|
|
localFrameSize: 0
|
|
savePoint: ''
|
|
restorePoint: ''
|
|
fixedStack: []
|
|
stack:
|
|
# Slot 0 holds $lr at offset -4. It is marked as not restored; the function
# returns with tPOP_RET, which pops into $pc rather than back into $lr.
- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
|
|
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
|
|
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
|
|
# Slot 1 holds $r7 at offset -8 and is restored on return.
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
|
|
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
|
|
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
|
|
callSites: []
|
|
constants: []
|
|
machineFunctionInfo: {}
|
|
body: |
|
|
; CHECK-LABEL: name: no_vpsel_liveout
|
|
; CHECK: bb.0.entry:
|
|
; CHECK: successors: %bb.1(0x80000000)
|
|
; CHECK: liveins: $lr, $r0, $r1, $r2, $r7
|
|
; CHECK: tCMPi8 renamable $r2, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
|
|
; CHECK: t2IT 0, 4, implicit-def $itstate
|
|
; CHECK: renamable $r0 = tMOVi8 $noreg, 0, 0 /* CC::eq */, $cpsr, implicit killed $r0, implicit $itstate
|
|
; CHECK: tBX_RET 0 /* CC::eq */, killed $cpsr, implicit $r0, implicit killed $itstate
|
|
; CHECK: bb.1.vector.ph:
|
|
; CHECK: successors: %bb.2(0x80000000)
|
|
; CHECK: liveins: $lr, $r0, $r1, $r2, $r7
|
|
; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
|
|
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
|
|
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
|
|
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
|
|
; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, $noreg, undef renamable $q0
|
|
; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2
|
|
; CHECK: bb.2.vector.body:
|
|
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
|
|
; CHECK: liveins: $lr, $q0, $r0, $r1
|
|
; CHECK: renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 0, $noreg, $noreg :: (load (s64) from %ir.lsr.iv17, align 2)
|
|
; CHECK: renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 0, killed $noreg, $noreg :: (load (s64) from %ir.lsr.iv1820, align 2)
|
|
; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, $noreg, undef renamable $q1
|
|
; CHECK: renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, $noreg, undef renamable $q0
|
|
; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2
|
|
; CHECK: bb.3.middle.block:
|
|
; CHECK: liveins: $q0
|
|
; CHECK: $r0 = VMOVRS killed $s3, 14 /* CC::al */, $noreg, implicit killed $q0
|
|
; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc, implicit killed $r0
|
|
; Input block: early exit for a zero element count.
; Compares $r2 with 0 and, inside an IT block, conditionally sets $r0 to 0 and
; returns. Condition operand 0 is EQ and 14 is AL, as annotated in the expected
; output above. Otherwise control falls through to bb.1.
bb.0.entry:
|
|
successors: %bb.1(0x80000000)
|
|
liveins: $r0, $r1, $r2, $lr, $r7
|
|
|
|
tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr
|
|
t2IT 0, 4, implicit-def $itstate
|
|
renamable $r0 = tMOVi8 $noreg, 0, 0, $cpsr, implicit killed $r0, implicit $itstate
|
|
tBX_RET 0, killed $cpsr, implicit $r0, implicit killed $itstate
|
|
|
|
; Input block: loop preheader.
; Saves $r7 and $lr, zeroes the accumulator $q0, computes the iteration count
; in $r12 and starts the low-overhead loop with t2DoLoopStart.
; In the expected output above, the whole count computation is gone and the
; loop is started with MVE_DLSTP_32 on the element count in $r2.
bb.1.vector.ph:
|
|
successors: %bb.2(0x80000000)
|
|
liveins: $r0, $r1, $r2, $lr, $r7
|
|
|
|
; Prologue: push $r7 and $lr, with CFI matching the two spill slots at -8 and -4.
frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
|
|
frame-setup CFI_INSTRUCTION def_cfa_offset 8
|
|
frame-setup CFI_INSTRUCTION offset $lr, -4
|
|
frame-setup CFI_INSTRUCTION offset $r7, -8
|
|
; $r3 = $r2 + 3
renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg
|
|
; $q0 = 0, the vector accumulator (zeroinitializer of %vec.phi in the IR).
renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, $noreg, undef renamable $q0
|
|
; $r3 = $r3 with the low two bits cleared
renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg
|
|
; $r12 = $r3 - 4
renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg
|
|
; $r3 = 1
renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg
|
|
; $r12 = $r3 + shifted $r12. NOTE(review): shifter operand 19 presumably
; encodes "lsr #2", which would match %tmp4 / %tmp5 in the IR; confirm.
renamable $r12 = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg
|
|
; Start the loop with the iteration count; a copy of the count is kept in $r3.
$lr = t2DoLoopStart renamable $r12
|
|
$r3 = tMOVr killed $r12, 14, $noreg
|
|
|
|
; Input block: the vector loop body.
; Builds a lane predicate from the remaining element count in $r2 (VCTP32),
; runs the two loads in a VPT block, then does an unpredicated multiply and
; accumulate into $q0. The loop counter lives in $r3 and is copied into $lr
; inside the loop, so $lr is not a live-in of this block.
; In the expected output above, the VCTP/VPST pair and the $r2/$r3 updates are
; removed, the loads lose their $vpr predicate and the loop ends with MVE_LETP.
bb.2.vector.body:
|
|
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
|
|
liveins: $q0, $r0, $r1, $r2, $r3
|
|
|
|
; $vpr = lane predicate derived from the element count in $r2.
renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg, $noreg
|
|
; NOTE(review): mask 4 presumably covers the two following loads; confirm
; against the VPT block mask encoding.
MVE_VPST 4, implicit $vpr
|
|
; Post-incrementing loads from the pointers in $r0 and $r1 (offset 8), both
; predicated on $vpr; the second load kills $vpr.
renamable $r0, renamable $q1 = MVE_VLDRHS32_post killed renamable $r0, 8, 1, renamable $vpr, $noreg :: (load (s64) from %ir.lsr.iv17, align 2)
|
|
renamable $r1, renamable $q2 = MVE_VLDRHS32_post killed renamable $r1, 8, 1, killed renamable $vpr, $noreg :: (load (s64) from %ir.lsr.iv1820, align 2)
|
|
; Copy the loop counter from $r3 into $lr for the t2LoopDec / t2LoopEnd below.
$lr = tMOVr $r3, 14, $noreg
|
|
; Unpredicated multiply (predicate operands are 0, $noreg).
renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, $noreg, undef renamable $q1
|
|
; $r3 -= 1 (loop counter copy); $r2 -= 4 (elements remaining).
renamable $r3, dead $cpsr = nsw tSUBi8 killed $r3, 1, 14, $noreg
|
|
renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg
|
|
; Unpredicated accumulate into $q0, the value that is live out of the loop.
renamable $q0 = MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, $noreg, undef renamable $q0
|
|
; Decrement $lr by 1 and branch back to bb.2 via t2LoopEnd; otherwise go to bb.3.
renamable $lr = t2LoopDec killed renamable $lr, 1
|
|
t2LoopEnd killed renamable $lr, %bb.2, implicit-def dead $cpsr
|
|
tB %bb.3, 14, $noreg
|
|
|
|
; Input block: loop exit.
; Moves the scalar register $s3 into $r0 as the return value, with an implicit
; use of $q0, then returns by popping $r7 and $pc.
; NOTE(review): $s3 is presumably the top 32-bit lane of $q0, matching the
; extractelement of lane 3 in the IR; confirm against the register definitions.
bb.3.middle.block:
|
|
liveins: $q0
|
|
|
|
$r0 = VMOVRS killed $s3, 14, $noreg, implicit $q0
|
|
tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0
|
|
|
|
...
|