The semantics of tail predication loops means that the value of LR as an instruction is executed determines the predicate. In other words: mov r3, #3 DLSTP lr, r3 // Start tail predication, lr==3 VADD.s32 q0, q1, q2 // Lanes 0,1 and 2 are updated in q0. mov lr, #1 VADD.s32 q0, q1, q2 // Only first lane is updated. This means that the value of lr cannot be spilled and re-used in tail predication regions without potentially altering the behaviour of the program. More lanes than required could be stored, for example, and in the case of a gather those lanes might not have been setup, leading to alignment exceptions. This patch adds a new lr predicate operand to MVE instructions in order to keep a reference to the lr that they use as a tail predicate. It will usually hold the zeroreg meaning not predicated, being set to the LR phi value in the MVETPAndVPTOptimisationsPass. This will prevent it from being spilled anywhere that it needs to be used. A lot of tests needed updating. Differential Revision: https://reviews.llvm.org/D107638
187 lines
9.1 KiB
YAML
187 lines
9.1 KiB
YAML
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
|
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve,+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
|
|
|
|
# There are 2 SUBS, so don't use tail predication
|
|
|
|
--- |
|
|
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
|
|
target triple = "thumbv8.1m.main-arm-unknown-eabi"
|
|
|
|
define dso_local void @use_before_def(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
|
|
entry:
|
|
%cmp8 = icmp sgt i32 %N, 0
|
|
%0 = add i32 %N, 3
|
|
%1 = lshr i32 %0, 2
|
|
%2 = shl nuw i32 %1, 2
|
|
%3 = add i32 %2, -4
|
|
%4 = lshr i32 %3, 2
|
|
%5 = add nuw nsw i32 %4, 1
|
|
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
|
|
|
|
vector.ph: ; preds = %entry
|
|
%start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %vector.ph
|
|
%lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
|
|
%lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
|
|
%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
|
|
%6 = phi i32 [ %start, %vector.ph ], [ %11, %vector.body ]
|
|
%7 = phi i32 [ %N, %vector.ph ], [ %9, %vector.body ]
|
|
%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
|
|
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
|
|
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
|
|
%8 = call <4 x i1> @llvm.arm.vctp32(i32 %7)
|
|
%9 = sub i32 %7, 4
|
|
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %8, <4 x i32> undef)
|
|
%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %8, <4 x i32> undef)
|
|
%10 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
|
|
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %8)
|
|
%scevgep = getelementptr i32, i32* %lsr.iv, i32 4
|
|
%scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
|
|
%scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
|
|
%11 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1)
|
|
%12 = icmp ne i32 %11, 0
|
|
br i1 %12, label %vector.body, label %for.cond.cleanup
|
|
|
|
for.cond.cleanup: ; preds = %vector.body, %entry
|
|
ret void
|
|
}
|
|
declare i32 @llvm.start.loop.iterations.i32(i32)
|
|
declare <4 x i1> @llvm.arm.vctp32(i32)
|
|
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
|
|
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
|
|
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
|
|
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
|
|
|
|
...
|
|
---
|
|
name: use_before_def
|
|
alignment: 2
|
|
exposesReturnsTwice: false
|
|
legalized: false
|
|
regBankSelected: false
|
|
selected: false
|
|
failedISel: false
|
|
tracksRegLiveness: true
|
|
hasWinCFI: false
|
|
registers: []
|
|
liveins:
|
|
- { reg: '$r0', virtual-reg: '' }
|
|
- { reg: '$r1', virtual-reg: '' }
|
|
- { reg: '$r2', virtual-reg: '' }
|
|
- { reg: '$r3', virtual-reg: '' }
|
|
frameInfo:
|
|
isFrameAddressTaken: false
|
|
isReturnAddressTaken: false
|
|
hasStackMap: false
|
|
hasPatchPoint: false
|
|
stackSize: 8
|
|
offsetAdjustment: 0
|
|
maxAlignment: 4
|
|
adjustsStack: false
|
|
hasCalls: false
|
|
stackProtector: ''
|
|
maxCallFrameSize: 0
|
|
cvBytesOfCalleeSavedRegisters: 0
|
|
hasOpaqueSPAdjustment: false
|
|
hasVAStart: false
|
|
hasMustTailInVarArgFunc: false
|
|
localFrameSize: 0
|
|
savePoint: ''
|
|
restorePoint: ''
|
|
fixedStack: []
|
|
stack:
|
|
- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
|
|
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
|
|
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
|
|
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
|
|
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
|
|
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
|
|
callSites: []
|
|
constants: []
|
|
machineFunctionInfo: {}
|
|
body: |
|
|
; CHECK-LABEL: name: use_before_def
|
|
; CHECK: bb.0.entry:
|
|
; CHECK: successors: %bb.1(0x80000000)
|
|
; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r7
|
|
; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
|
|
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8
|
|
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4
|
|
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8
|
|
; CHECK: dead $r7 = frame-setup tMOVr $sp, 14 /* CC::al */, $noreg
|
|
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_register $r7
|
|
; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
|
|
; CHECK: t2IT 11, 8, implicit-def $itstate
|
|
; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate
|
|
; CHECK: bb.1.vector.ph:
|
|
; CHECK: successors: %bb.2(0x80000000)
|
|
; CHECK: liveins: $r0, $r1, $r2, $r3
|
|
; CHECK: renamable $r12 = t2ADDri renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
|
|
; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
|
|
; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg
|
|
; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg
|
|
; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
|
|
; CHECK: bb.2.vector.body:
|
|
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000)
|
|
; CHECK: liveins: $lr, $r0, $r1, $r2, $r3
|
|
; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg, $noreg
|
|
; CHECK: MVE_VPST 4, implicit $vpr
|
|
; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr, $lr :: (load (s128) from %ir.lsr.iv13, align 4)
|
|
; CHECK: renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr, $lr :: (load (s128) from %ir.lsr.iv1416, align 4)
|
|
; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
|
|
; CHECK: renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, $lr, undef renamable $q0
|
|
; CHECK: MVE_VPST 8, implicit $vpr
|
|
; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr, $lr :: (store (s128) into %ir.lsr.iv1719, align 4)
|
|
; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg
|
|
; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2
|
|
; CHECK: bb.3.for.cond.cleanup:
|
|
; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
|
|
bb.0.entry:
|
|
successors: %bb.1(0x80000000)
|
|
liveins: $r0, $r1, $r2, $r3, $r7, $lr
|
|
|
|
frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
|
|
frame-setup CFI_INSTRUCTION def_cfa_offset 8
|
|
frame-setup CFI_INSTRUCTION offset $lr, -4
|
|
frame-setup CFI_INSTRUCTION offset $r7, -8
|
|
$r7 = frame-setup tMOVr $sp, 14, $noreg
|
|
frame-setup CFI_INSTRUCTION def_cfa_register $r7
|
|
tCMPi8 renamable $r3, 1, 14, $noreg, implicit-def $cpsr
|
|
t2IT 11, 8, implicit-def $itstate
|
|
tPOP_RET 11, killed $cpsr, def $r7, def $pc, implicit killed $itstate
|
|
|
|
bb.1.vector.ph:
|
|
successors: %bb.2(0x80000000)
|
|
liveins: $r0, $r1, $r2, $r3, $r7, $lr
|
|
|
|
renamable $r12 = t2ADDri renamable $r3, 3, 14, $noreg, $noreg
|
|
renamable $lr = t2MOVi 1, 14, $noreg, $noreg
|
|
renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg
|
|
renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg
|
|
renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg
|
|
$lr = t2DoLoopStart renamable $lr
|
|
|
|
bb.2.vector.body:
|
|
successors: %bb.2(0x7c000000), %bb.3(0x04000000)
|
|
liveins: $lr, $r0, $r1, $r2, $r3
|
|
|
|
renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg, $noreg
|
|
MVE_VPST 4, implicit $vpr
|
|
renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr, $lr :: (load (s128) from %ir.lsr.iv13, align 4)
|
|
renamable $r2, renamable $q1 = MVE_VLDRWU32_post killed renamable $r2, 16, 1, renamable $vpr, $lr :: (load (s128) from %ir.lsr.iv1416, align 4)
|
|
renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
|
|
renamable $q0 = nsw MVE_VADDi32 killed renamable $q1, killed renamable $q0, 0, $noreg, $lr, undef renamable $q0
|
|
MVE_VPST 8, implicit $vpr
|
|
renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr, $lr :: (store (s128) into %ir.lsr.iv1719, align 4)
|
|
renamable $lr = t2LoopDec killed renamable $lr, 1
|
|
renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg
|
|
t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr
|
|
tB %bb.3, 14, $noreg
|
|
|
|
bb.3.for.cond.cleanup:
|
|
tPOP_RET 14, $noreg, def $r7, def $pc
|
|
|
|
...
|