diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6cc0b681193f..a98b7a842092 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2325,6 +2325,59 @@ std::pair ARMTargetLowering::computeAddrForCallArg( return std::make_pair(DstAddr, DstInfo); } +// Returns the type of copying which is required to set up a byval argument to +// a tail-called function. This isn't needed for non-tail calls, because they +// always need the equivalent of CopyOnce, but tail-calls sometimes need two to +// avoid clobbering another argument (CopyViaTemp), and sometimes can be +// optimised to zero copies when forwarding an argument from the caller's +// caller (NoCopy). +ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall( + SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const { + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo(); + + // Globals are always safe to copy from. + if (isa(Src) || isa(Src)) + return CopyOnce; + + // Can only analyse frame index nodes, conservatively assume we need a + // temporary. + auto *SrcFrameIdxNode = dyn_cast(Src); + auto *DstFrameIdxNode = dyn_cast(Dst); + if (!SrcFrameIdxNode || !DstFrameIdxNode) + return CopyViaTemp; + + int SrcFI = SrcFrameIdxNode->getIndex(); + int DstFI = DstFrameIdxNode->getIndex(); + assert(MFI.isFixedObjectIndex(DstFI) && + "byval passed in non-fixed stack slot"); + + int64_t SrcOffset = MFI.getObjectOffset(SrcFI); + int64_t DstOffset = MFI.getObjectOffset(DstFI); + + // If the source is in the local frame, then the copy to the argument memory + // is always valid. 
+ bool FixedSrc = MFI.isFixedObjectIndex(SrcFI); + if (!FixedSrc || + (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize())) + return CopyOnce; + + // In the case of byval arguments split between registers and the stack, + // computeAddrForCallArg returns a FrameIndex which corresponds only to the + // stack portion, but the Src SDValue will refer to the full value, including + // the local stack memory that the register portion gets stored into. We only + // need to compare them for equality, so normalise on the full value version. + uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI); + DstOffset -= RegSize; + + // If the value is already in the correct location, then no copying is + // needed. If not, then we need to copy via a temporary. + if (SrcOffset == DstOffset) + return NoCopy; + else + return CopyViaTemp; +} + void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, @@ -2499,37 +2552,58 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // overwritten by the stores of the outgoing arguments. To avoid this, we // need to make a temporary copy of them in local stack space, then copy back // to the argument area. - // TODO This could be optimised to skip byvals which are already being copied - // from local stack space, or which are copied from the incoming stack at the - // exact same location. 
DenseMap ByValTemporaries; SDValue ByValTempChain; if (isTailCall) { - for (unsigned ArgIdx = 0, e = OutVals.size(); ArgIdx != e; ++ArgIdx) { - SDValue Arg = OutVals[ArgIdx]; + SmallVector ByValCopyChains; + for (const CCValAssign &VA : ArgLocs) { + unsigned ArgIdx = VA.getValNo(); + SDValue Src = OutVals[ArgIdx]; ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags; - if (Flags.isByVal()) { - int FrameIdx = MFI.CreateStackObject( + if (!Flags.isByVal()) + continue; + + SDValue Dst; + MachinePointerInfo DstInfo; + std::tie(Dst, DstInfo) = + computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff); + ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags); + + if (Copy == NoCopy) { + // If the argument is already at the correct offset on the stack + // (because we are forwarding a byval argument from our caller), we + // don't need any copying. + continue; + } else if (Copy == CopyOnce) { + // If the argument is in our local stack frame, no other argument + // preparation can clobber it, so we can copy it to the final location + // later. + ByValTemporaries[ArgIdx] = Src; + } else { + assert(Copy == CopyViaTemp && "unexpected enum value"); + // If we might be copying this argument from the outgoing argument + // stack area, we need to copy via a temporary in the local stack + // frame. 
+ int TempFrameIdx = MFI.CreateStackObject( Flags.getByValSize(), Flags.getNonZeroByValAlign(), false); - SDValue Dst = - DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout())); + SDValue Temp = + DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout())); SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); SDValue AlignNode = DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue Ops[] = { Chain, Dst, Arg, SizeNode, AlignNode}; - MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, - Ops)); - ByValTemporaries[ArgIdx] = Dst; + SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode}; + ByValCopyChains.push_back( + DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops)); + ByValTemporaries[ArgIdx] = Temp; } } - if (!MemOpChains.empty()) { - ByValTempChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); - MemOpChains.clear(); - } + if (!ByValCopyChains.empty()) + ByValTempChain = + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains); } // During a tail call, stores to the argument area must happen after all of @@ -2644,13 +2718,17 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); SDValue ByValSrc; - if (ByValTemporaries.contains(realArgIdx)) + bool NeedsStackCopy; + if (ByValTemporaries.contains(realArgIdx)) { ByValSrc = ByValTemporaries[realArgIdx]; - else + NeedsStackCopy = true; + } else { ByValSrc = Arg; + NeedsStackCopy = !isTailCall; + } + // If part of the argument is in registers, load them. 
if (CurByValIdx < ByValArgsCount) { - unsigned RegBegin, RegEnd; CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); @@ -2674,7 +2752,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CCInfo.nextInRegsParam(); } - if (Flags.getByValSize() > 4*offset) { + // If the memory part of the argument isn't already in the correct place + // (which can happen with tail calls), copy it into the argument area. + if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) { auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Dst; MachinePointerInfo DstInfo; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index ef651bc3d84c..0e086f3340cc 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -395,6 +395,19 @@ class VectorType; // ARMTargetLowering - ARM Implementation of the TargetLowering interface class ARMTargetLowering : public TargetLowering { + // Copying needed for an outgoing byval argument. + enum ByValCopyKind { + // Argument is already in the correct location, no copy needed. + NoCopy, + // Argument value is currently in the local stack frame, needs copying to + // outgoing argument area. + CopyOnce, + // Argument value is currently in the outgoing argument area, but not at + // the correct offset, so needs copying via a temporary in local stack + // space. 
+ CopyViaTemp, + }; + public: explicit ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI); @@ -809,6 +822,9 @@ class VectorType; computeAddrForCallArg(const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr, bool IsTailCall, int SPDiff) const; + ByValCopyKind ByValNeedsCopyForTailCall(SelectionDAG &DAG, SDValue Src, + SDValue Dst, + ISD::ArgFlagsTy Flags) const; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/ARM/musttail.ll b/llvm/test/CodeGen/ARM/musttail.ll index 3c577134f696..aa7b81d07002 100644 --- a/llvm/test/CodeGen/ARM/musttail.ll +++ b/llvm/test/CodeGen/ARM/musttail.ll @@ -129,40 +129,15 @@ declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4) ; actually passed in registers and the stack in the same way for the caller and ; callee. Within @large_caller the first 16 bytes of the argument are spilled ; to the local stack frame, but for the tail-call they are passed in r0-r3, so -; it's safe to de-allocate that memory before the call. Most of the code -; generated for this isn't needed, but that's a missed optimisation, not a -; correctness issue. +; it's safe to de-allocate that memory before the call. +; TODO: The SUB and STM instructions are unnecessary and could be optimised +; out, but the behaviour of this is still correct. 
define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { ; CHECK-LABEL: large_caller: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .pad #20 -; CHECK-NEXT: sub sp, sp, #20 -; CHECK-NEXT: add r12, sp, #28 -; CHECK-NEXT: add lr, sp, #44 -; CHECK-NEXT: stm r12, {r0, r1, r2, r3} -; CHECK-NEXT: add r0, sp, #28 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: add r12, r1, #16 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldm sp, {r0, r1, r2, r3} -; CHECK-NEXT: ldr r4, [r12], #4 -; CHECK-NEXT: str r4, [lr], #4 -; CHECK-NEXT: add sp, sp, #20 -; CHECK-NEXT: pop {r4, lr} -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: stm sp!, {r0, r1, r2, r3} ; CHECK-NEXT: b large_callee entry: musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a) @@ -176,34 +151,10 @@ define void @large_caller_check_regs(%twenty_bytes* byval(%twenty_bytes) align 4 ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .pad #20 -; CHECK-NEXT: sub sp, sp, #20 -; CHECK-NEXT: add r12, sp, #28 -; CHECK-NEXT: add lr, sp, #44 -; CHECK-NEXT: stm r12, {r0, r1, r2, r3} +; CHECK-NEXT: stm sp, {r0, r1, r2, r3} ; CHECK-NEXT: @APP ; CHECK-NEXT: @NO_APP -; CHECK-NEXT: add r0, sp, #28 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: add r12, r1, #16 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; 
CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldm sp, {r0, r1, r2, r3} -; CHECK-NEXT: ldr r4, [r12], #4 -; CHECK-NEXT: str r4, [lr], #4 -; CHECK-NEXT: add sp, sp, #20 -; CHECK-NEXT: pop {r4, lr} -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: pop {r0, r1, r2, r3} ; CHECK-NEXT: b large_callee entry: tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"() @@ -218,44 +169,30 @@ entry: define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { ; CHECK-LABEL: large_caller_new_value: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, sp, #40 -; CHECK-NEXT: add r12, sp, #48 -; CHECK-NEXT: add lr, sp, #64 +; CHECK-NEXT: .pad #36 +; CHECK-NEXT: sub sp, sp, #36 +; CHECK-NEXT: add r12, sp, #20 ; CHECK-NEXT: stm r12, {r0, r1, r2, r3} ; CHECK-NEXT: mov r0, #4 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: str r0, [sp, #36] +; CHECK-NEXT: add r1, sp, #36 +; CHECK-NEXT: str r0, [sp, #16] ; CHECK-NEXT: mov r0, #3 -; CHECK-NEXT: str r0, [sp, #32] +; CHECK-NEXT: str r0, [sp, #12] ; CHECK-NEXT: mov r0, #2 -; CHECK-NEXT: str r0, [sp, #28] +; CHECK-NEXT: str r0, [sp, #8] ; CHECK-NEXT: mov r0, #1 -; CHECK-NEXT: str r0, [sp, #24] +; CHECK-NEXT: str r0, [sp, #4] ; CHECK-NEXT: mov r0, #0 -; CHECK-NEXT: str r0, [sp, #20] -; CHECK-NEXT: add r0, sp, #20 -; CHECK-NEXT: add r12, r1, #16 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: add r0, r0, #16 +; CHECK-NEXT: mov r3, #3 ; CHECK-NEXT: ldr r2, [r0], #4 ; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldr r2, [r0], #4 -; CHECK-NEXT: str r2, [r1], #4 -; CHECK-NEXT: ldm sp, {r0, r1, r2, r3} -; CHECK-NEXT: ldr r4, [r12], #4 -; CHECK-NEXT: str r4, 
[lr], #4 -; CHECK-NEXT: add sp, sp, #40 -; CHECK-NEXT: pop {r4, lr} -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: mov r1, #1 +; CHECK-NEXT: mov r2, #2 +; CHECK-NEXT: add sp, sp, #36 ; CHECK-NEXT: b large_callee entry: %y = alloca %twenty_bytes, align 4 @@ -335,3 +272,71 @@ entry: musttail call void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b, %twenty_bytes* byval(%twenty_bytes) align 4 %a) ret void } + +; A forwarded byval arg, but at a different offset on the stack, so it needs to +; be copied to the local stack frame first. This can't be musttail because of +; the different signatures, but is still tail-called as an optimisation. +declare void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4) +define void @shift_byval(i32 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) { +; CHECK-LABEL: shift_byval: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #12 +; CHECK-NEXT: sub sp, sp, #12 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, sp, #20 +; CHECK-NEXT: add r0, sp, #28 +; CHECK-NEXT: add lr, sp, #40 +; CHECK-NEXT: stm r0, {r1, r2, r3} +; CHECK-NEXT: add r0, sp, #28 +; CHECK-NEXT: mov r1, sp +; CHECK-NEXT: ldr r2, [r0], #4 +; CHECK-NEXT: add r12, r1, #16 +; CHECK-NEXT: str r2, [r1], #4 +; CHECK-NEXT: ldr r2, [r0], #4 +; CHECK-NEXT: str r2, [r1], #4 +; CHECK-NEXT: ldr r2, [r0], #4 +; CHECK-NEXT: str r2, [r1], #4 +; CHECK-NEXT: ldr r2, [r0], #4 +; CHECK-NEXT: str r2, [r1], #4 +; CHECK-NEXT: ldr r2, [r0], #4 +; CHECK-NEXT: str r2, [r1], #4 +; CHECK-NEXT: ldm sp, {r0, r1, r2, r3} +; CHECK-NEXT: ldr r4, [r12], #4 +; CHECK-NEXT: str r4, [lr], #4 +; CHECK-NEXT: add sp, sp, #20 +; CHECK-NEXT: pop {r4, lr} +; CHECK-NEXT: add sp, sp, #12 +; CHECK-NEXT: b shift_byval_callee +entry: + tail call void @shift_byval_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b) + ret void +} + +; A global object passed to a byval argument, so it must be copied, but 
doesn't +; need a stack temporary. +@large_global = external global %twenty_bytes +define void @large_caller_from_global(%twenty_bytes* byval(%twenty_bytes) align 4 %a) { +; CHECK-LABEL: large_caller_from_global: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: add r12, sp, #8 +; CHECK-NEXT: add lr, sp, #24 +; CHECK-NEXT: stm r12, {r0, r1, r2, r3} +; CHECK-NEXT: movw r3, :lower16:large_global +; CHECK-NEXT: movt r3, :upper16:large_global +; CHECK-NEXT: add r12, r3, #16 +; CHECK-NEXT: ldm r3, {r0, r1, r2, r3} +; CHECK-NEXT: ldr r4, [r12], #4 +; CHECK-NEXT: str r4, [lr], #4 +; CHECK-NEXT: pop {r4, lr} +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: b large_callee +entry: + musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 @large_global) + ret void +}