[NVPTX] Further cleanup call isel (#146411)

This change continues the rewriting and cleanup of DAG ISel for
formal arguments, return values, and function calls. This causes some
incidental changes, mostly to instruction ordering and register naming,
but also a couple of improvements enabled by using scalar types earlier
in the lowering.
This commit is contained in:
Alex MacLean
2025-07-01 14:55:04 -07:00
committed by GitHub
parent 5ed852f7f7
commit 475cd8dfaf
16 changed files with 2086 additions and 2089 deletions

View File

@@ -28,6 +28,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -390,35 +391,27 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
/// and promote them to a larger size if they're not.
///
/// The promoted type is placed in \p PromoteVT if the function returns true.
static std::optional<MVT> PromoteScalarIntegerPTX(const EVT &VT) {
static EVT promoteScalarIntegerPTX(const EVT VT) {
if (VT.isScalarInteger()) {
MVT PromotedVT;
switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
default:
llvm_unreachable(
"Promotion is not suitable for scalars of size larger than 64-bits");
case 1:
PromotedVT = MVT::i1;
break;
return MVT::i1;
case 2:
case 4:
case 8:
PromotedVT = MVT::i8;
break;
return MVT::i8;
case 16:
PromotedVT = MVT::i16;
break;
return MVT::i16;
case 32:
PromotedVT = MVT::i32;
break;
return MVT::i32;
case 64:
PromotedVT = MVT::i64;
break;
return MVT::i64;
}
if (VT != PromotedVT)
return PromotedVT;
}
return std::nullopt;
return VT;
}
// Check whether we can merge loads/stores of some of the pieces of a
@@ -1053,10 +1046,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
break;
MAKE_CASE(NVPTXISD::RET_GLUE)
MAKE_CASE(NVPTXISD::DeclareParam)
MAKE_CASE(NVPTXISD::DeclareArrayParam)
MAKE_CASE(NVPTXISD::DeclareScalarParam)
MAKE_CASE(NVPTXISD::DeclareRet)
MAKE_CASE(NVPTXISD::DeclareRetParam)
MAKE_CASE(NVPTXISD::CALL)
MAKE_CASE(NVPTXISD::LoadParam)
MAKE_CASE(NVPTXISD::LoadParamV2)
@@ -1162,8 +1153,8 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
}
std::string NVPTXTargetLowering::getPrototype(
const DataLayout &DL, Type *retTy, const ArgListTy &Args,
const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign RetAlign,
const DataLayout &DL, Type *RetTy, const ArgListTy &Args,
const SmallVectorImpl<ISD::OutputArg> &Outs,
std::optional<unsigned> FirstVAArg, const CallBase &CB,
unsigned UniqueCallSite) const {
auto PtrVT = getPointerTy(DL);
@@ -1172,22 +1163,22 @@ std::string NVPTXTargetLowering::getPrototype(
raw_string_ostream O(Prototype);
O << "prototype_" << UniqueCallSite << " : .callprototype ";
if (retTy->isVoidTy()) {
if (RetTy->isVoidTy()) {
O << "()";
} else {
O << "(";
if (shouldPassAsArray(retTy)) {
assert(RetAlign && "RetAlign must be set for non-void return types");
O << ".param .align " << RetAlign->value() << " .b8 _["
<< DL.getTypeAllocSize(retTy) << "]";
} else if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
if (shouldPassAsArray(RetTy)) {
const Align RetAlign = getArgumentAlignment(&CB, RetTy, 0, DL);
O << ".param .align " << RetAlign.value() << " .b8 _["
<< DL.getTypeAllocSize(RetTy) << "]";
} else if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy()) {
unsigned size = 0;
if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
size = ITy->getBitWidth();
} else {
assert(retTy->isFloatingPointTy() &&
assert(RetTy->isFloatingPointTy() &&
"Floating point type expected here");
size = retTy->getPrimitiveSizeInBits();
size = RetTy->getPrimitiveSizeInBits();
}
// PTX ABI requires all scalar return values to be at least 32
// bits in size. fp16 normally uses .b16 as its storage type in
@@ -1195,7 +1186,7 @@ std::string NVPTXTargetLowering::getPrototype(
size = promoteScalarArgumentSize(size);
O << ".param .b" << size << " _";
} else if (isa<PointerType>(retTy)) {
} else if (isa<PointerType>(RetTy)) {
O << ".param .b" << PtrVT.getSizeInBits() << " _";
} else {
llvm_unreachable("Unknown return type");
@@ -1256,7 +1247,7 @@ std::string NVPTXTargetLowering::getPrototype(
if (FirstVAArg)
O << (first ? "" : ",") << " .param .align "
<< STI.getMaxRequiredAlignment() << " .b8 _[]\n";
<< STI.getMaxRequiredAlignment() << " .b8 _[]";
O << ")";
if (shouldEmitPTXNoReturn(&CB, *nvTM))
O << " .noreturn";
@@ -1442,6 +1433,21 @@ static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags) {
return ISD::ANY_EXTEND;
}
static SDValue correctParamType(SDValue V, EVT ExpectedVT,
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
SDLoc dl) {
const EVT ActualVT = V.getValueType();
assert((ActualVT == ExpectedVT ||
(ExpectedVT.isInteger() && ActualVT.isInteger())) &&
"Non-integer argument type size mismatch");
if (ExpectedVT.bitsGT(ActualVT))
return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
if (ExpectedVT.bitsLT(ActualVT))
return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
return V;
}
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
@@ -1505,9 +1511,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
"Outs and OutVals must be the same size");
// Declare the .params or .reg need to pass values
// to the function
for (const auto [ArgI, Arg] : llvm::enumerate(Args)) {
const auto ArgOuts = AllOuts.take_while(
[ArgI = ArgI](auto O) { return O.OrigArgIndex == ArgI; });
for (const auto E : llvm::enumerate(Args)) {
const auto ArgI = E.index();
const auto Arg = E.value();
const auto ArgOuts =
AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
AllOuts = AllOuts.drop_front(ArgOuts.size());
AllOutVals = AllOutVals.drop_front(ArgOuts.size());
@@ -1515,6 +1523,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const bool IsVAArg = (ArgI >= FirstVAArg);
const bool IsByVal = Arg.IsByVal;
const SDValue ParamSymbol =
getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
@@ -1525,38 +1536,43 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(VTs.size() == Offsets.size() && "Size mismatch");
assert((IsByVal || VTs.size() == ArgOuts.size()) && "Size mismatch");
Align ArgAlign;
const Align ArgAlign = [&]() {
if (IsByVal) {
// The ByValAlign in the Outs[OIdx].Flags is always set at this point,
// so we don't need to worry whether it's naturally aligned or not.
// See TargetLowering::LowerCallTo().
Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
InitialAlign, DL);
const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
const Align ByValAlign = getFunctionByValParamAlign(
CB->getCalledFunction(), ETy, InitialAlign, DL);
if (IsVAArg)
VAOffset = alignTo(VAOffset, ArgAlign);
} else {
ArgAlign = getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
VAOffset = alignTo(VAOffset, ByValAlign);
return ByValAlign;
}
return getArgumentAlignment(CB, Arg.Ty, ArgI + 1, DL);
}();
const unsigned TypeSize = DL.getTypeAllocSize(ETy);
assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) &&
"type size mismatch");
const bool PassAsArray = IsByVal || shouldPassAsArray(Arg.Ty);
const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> {
if (IsVAArg) {
if (ArgI == FirstVAArg) {
VADeclareParam = Chain =
DAG.getNode(NVPTXISD::DeclareParam, dl, {MVT::Other, MVT::Glue},
{Chain, GetI32(STI.getMaxRequiredAlignment()),
GetI32(ArgI), GetI32(1), InGlue});
VADeclareParam = DAG.getNode(
NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
{Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()),
GetI32(0), InGlue});
return VADeclareParam;
}
} else if (PassAsArray) {
return std::nullopt;
}
if (IsByVal || shouldPassAsArray(Arg.Ty)) {
// declare .param .align <align> .b8 .param<n>[<size>];
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, {MVT::Other, MVT::Glue},
{Chain, GetI32(ArgAlign.value()), GetI32(ArgI),
return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
{MVT::Other, MVT::Glue},
{Chain, ParamSymbol, GetI32(ArgAlign.value()),
GetI32(TypeSize), InGlue});
} else {
}
assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
// declare .param .b<size> .param<n>;
@@ -1568,11 +1584,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
? promoteScalarArgumentSize(TypeSize * 8)
: TypeSize * 8;
Chain =
DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
{Chain, GetI32(ArgI), GetI32(PromotedSize), InGlue});
return DAG.getNode(NVPTXISD::DeclareScalarParam, dl,
{MVT::Other, MVT::Glue},
{Chain, ParamSymbol, GetI32(PromotedSize), InGlue});
}();
if (ArgDeclare) {
Chain = ArgDeclare->getValue(0);
InGlue = ArgDeclare->getValue(1);
}
InGlue = Chain.getValue(1);
// PTX Interoperability Guide 3.3(A): [Integer] Values shorter
// than 32-bits are sign extended or zero extended, depending on
@@ -1594,8 +1613,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else {
StVal = ArgOutVals[I];
if (auto PromotedVT = PromoteScalarIntegerPTX(StVal.getValueType())) {
StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, *PromotedVT,
auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType());
if (PromotedVT != StVal.getValueType()) {
StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT,
StVal);
}
}
@@ -1619,12 +1639,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned J = 0;
for (const unsigned NumElts : VectorInfo) {
const int CurOffset = Offsets[J];
EVT EltVT = VTs[J];
EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
const Align PartAlign = commonAlignment(ArgAlign, CurOffset);
if (auto PromotedVT = PromoteScalarIntegerPTX(EltVT))
EltVT = *PromotedVT;
// If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
// scalar store. In such cases, fall back to byte stores.
if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) {
@@ -1695,27 +1712,26 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
MaybeAlign RetAlign = std::nullopt;
// Handle Result
if (!Ins.empty()) {
RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
// Declare
// .param .align N .b8 retval0[<size-in-bytes>], or
// .param .b<size-in-bits> retval0
const SDValue RetDeclare = [&]() {
const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy);
if (!shouldPassAsArray(RetTy)) {
const unsigned PromotedResultSize = promoteScalarArgumentSize(ResultSize);
Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, {MVT::Other, MVT::Glue},
{Chain, GetI32(PromotedResultSize), InGlue});
InGlue = Chain.getValue(1);
} else {
Chain = DAG.getNode(
NVPTXISD::DeclareRetParam, dl, {MVT::Other, MVT::Glue},
{Chain, GetI32(RetAlign->value()), GetI32(ResultSize / 8), InGlue});
InGlue = Chain.getValue(1);
if (shouldPassAsArray(RetTy)) {
const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
return DAG.getNode(NVPTXISD::DeclareArrayParam, dl,
{MVT::Other, MVT::Glue},
{Chain, RetSymbol, GetI32(RetAlign.value()),
GetI32(ResultSize / 8), InGlue});
}
const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize);
return DAG.getNode(
NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
{Chain, RetSymbol, GetI32(PromotedResultSize), InGlue});
}();
Chain = RetDeclare.getValue(0);
InGlue = RetDeclare.getValue(1);
}
const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
@@ -1760,7 +1776,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// The prototype is embedded in a string and put as the operand for a
// CallPrototype SDNode which will print out to the value of the string.
std::string Proto =
getPrototype(DL, RetTy, Args, CLI.Outs, RetAlign,
getPrototype(DL, RetTy, Args, CLI.Outs,
HasVAArgs ? std::optional(FirstVAArg) : std::nullopt, *CB,
UniqueCallSite);
const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
@@ -1773,11 +1789,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (ConvertToIndirectCall) {
// Copy the function ptr to a ptx register and use the register to call the
// function.
EVT DestVT = Callee.getValueType();
MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
const MVT DestVT = Callee.getValueType().getSimpleVT();
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned DestReg =
RegInfo.createVirtualRegister(TLI.getRegClassFor(DestVT.getSimpleVT()));
Register DestReg = MRI.createVirtualRegister(TLI.getRegClassFor(DestVT));
auto RegCopy = DAG.getCopyToReg(DAG.getEntryNode(), dl, DestReg, Callee);
Callee = DAG.getCopyFromReg(RegCopy, dl, DestReg, DestVT);
}
@@ -1810,7 +1825,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
assert(VTs.size() == Ins.size() && "Bad value decomposition");
assert(RetAlign && "RetAlign is guaranteed to be set");
const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
// PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
// 32-bits are sign extended or zero extended, depending on whether
@@ -1818,17 +1833,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const bool ExtendIntegerRetVal =
RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, *RetAlign);
const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
unsigned I = 0;
for (const unsigned VectorizedSize : VectorInfo) {
EVT TheLoadType = VTs[I];
EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]);
EVT EltType = Ins[I].VT;
const Align EltAlign = commonAlignment(*RetAlign, Offsets[I]);
const Align EltAlign = commonAlignment(RetAlign, Offsets[I]);
if (auto PromotedVT = PromoteScalarIntegerPTX(TheLoadType)) {
TheLoadType = *PromotedVT;
EltType = *PromotedVT;
}
if (TheLoadType != VTs[I])
EltType = TheLoadType;
if (ExtendIntegerRetVal) {
TheLoadType = MVT::i32;
@@ -1898,13 +1911,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
continue;
}
SDValue Ret = DAG.getNode(
NVPTXISD::ProxyReg, dl,
{ProxyRegOps[I].getSimpleValueType(), MVT::Other, MVT::Glue},
{Chain, ProxyRegOps[I], InGlue});
Chain = Ret.getValue(1);
InGlue = Ret.getValue(2);
SDValue Ret =
DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(),
{Chain, ProxyRegOps[I]});
const EVT ExpectedVT = Ins[I].VT;
if (!Ret.getValueType().bitsEq(ExpectedVT)) {
@@ -1914,14 +1923,10 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
for (SDValue &T : TempProxyRegOps) {
SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl,
{T.getSimpleValueType(), MVT::Other, MVT::Glue},
{Chain, T.getOperand(0), InGlue});
SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(),
{Chain, T.getOperand(0)});
DAG.ReplaceAllUsesWith(T, Repl);
DAG.RemoveDeadNode(T.getNode());
Chain = Repl.getValue(1);
InGlue = Repl.getValue(2);
}
// set isTailCall to false for now, until we figure out how to express
@@ -3293,11 +3298,17 @@ bool NVPTXTargetLowering::splitValueIntoRegisterParts(
// Name of the symbol is composed from its index and the function name.
// Negative index corresponds to special parameter (unsized array) used for
// passing variable arguments.
SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
EVT v) const {
SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
EVT T) const {
StringRef SavedStr = nvTM->getStrPool().save(
getParamName(&DAG.getMachineFunction().getFunction(), idx));
return DAG.getExternalSymbol(SavedStr.data(), v);
getParamName(&DAG.getMachineFunction().getFunction(), I));
return DAG.getExternalSymbol(SavedStr.data(), T);
}
SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
EVT T) const {
const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
return DAG.getExternalSymbol(SavedStr.data(), T);
}
SDValue NVPTXTargetLowering::LowerFormalArguments(
@@ -3394,8 +3405,11 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
const unsigned PackingAmt =
LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
const EVT VecVT = EVT::getVectorVT(
F->getContext(), LoadVT.getScalarType(), NumElts * PackingAmt);
const EVT VecVT =
NumElts == 1
? LoadVT
: EVT::getVectorVT(F->getContext(), LoadVT.getScalarType(),
NumElts * PackingAmt);
SDValue VecAddr = DAG.getObjectPtrOffset(
dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
@@ -3409,22 +3423,16 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
if (P.getNode())
P.getNode()->setIROrder(Arg.getArgNo() + 1);
for (const unsigned J : llvm::seq(NumElts)) {
SDValue Elt = DAG.getNode(
LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR
SDValue Elt =
NumElts == 1
? P
: DAG.getNode(LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR
: ISD::EXTRACT_VECTOR_ELT,
dl, LoadVT, P, DAG.getVectorIdxConstant(J * PackingAmt, dl));
dl, LoadVT, P,
DAG.getVectorIdxConstant(J * PackingAmt, dl));
// Extend or truncate the element if necessary (e.g. an i8 is loaded
// into an i16 register)
const EVT ExpectedVT = ArgIns[I + J].VT;
assert((Elt.getValueType() == ExpectedVT ||
(ExpectedVT.isInteger() && Elt.getValueType().isInteger())) &&
"Non-integer argument type size mismatch");
if (ExpectedVT.bitsGT(Elt.getValueType()))
Elt = DAG.getNode(getExtOpcode(ArgIns[I + J].Flags), dl, ExpectedVT,
Elt);
else if (ExpectedVT.bitsLT(Elt.getValueType()))
Elt = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Elt);
Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
DAG, dl);
InVals.push_back(Elt);
}
I += NumElts;
@@ -3467,25 +3475,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const auto GetRetVal = [&](unsigned I) -> SDValue {
SDValue RetVal = OutVals[I];
assert(!PromoteScalarIntegerPTX(RetVal.getValueType()) &&
assert(promoteScalarIntegerPTX(RetVal.getValueType()) ==
RetVal.getValueType() &&
"OutVal type should always be legal");
EVT VTI = VTs[I];
if (const auto PromotedVT = PromoteScalarIntegerPTX(VTI))
VTI = *PromotedVT;
const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
const EVT StoreVT =
ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
assert((RetVal.getValueType() == StoreVT ||
(StoreVT.isInteger() && RetVal.getValueType().isInteger())) &&
"Non-integer argument type size mismatch");
if (StoreVT.bitsGT(RetVal.getValueType())) {
RetVal = DAG.getNode(getExtOpcode(Outs[I].Flags), dl, StoreVT, RetVal);
} else if (StoreVT.bitsLT(RetVal.getValueType())) {
RetVal = DAG.getNode(ISD::TRUNCATE, dl, StoreVT, RetVal);
}
return RetVal;
return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
};
const auto RetAlign = getFunctionParamOptimizedAlign(&F, RetTy, DL);
@@ -3500,7 +3497,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
if (NumElts == 1) {
Val = GetRetVal(I);
} else {
SmallVector<SDValue, 6> StoreVals;
SmallVector<SDValue, 4> StoreVals;
for (const unsigned J : llvm::seq(NumElts)) {
SDValue ValJ = GetRetVal(I + J);
if (ValJ.getValueType().isVector())
@@ -3514,7 +3511,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
Val = DAG.getBuildVector(VT, dl, StoreVals);
}
SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
SDValue Ptr =
DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));

View File

@@ -25,10 +25,15 @@ enum NodeType : unsigned {
// Start the numbering from where ISD NodeType finishes.
FIRST_NUMBER = ISD::BUILTIN_OP_END,
RET_GLUE,
DeclareParam,
/// These nodes represent a parameter declaration. In PTX this will look like:
/// .param .align 16 .b8 param0[1024];
/// .param .b32 retval0;
///
/// DeclareArrayParam(Chain, Externalsym, Align, Size, Glue)
/// DeclareScalarParam(Chain, Externalsym, Size, Glue)
DeclareScalarParam,
DeclareRetParam,
DeclareRet,
DeclareArrayParam,
/// This node represents a PTX call instruction. Its operands are as follows:
///
@@ -174,7 +179,6 @@ public:
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &,
const SmallVectorImpl<ISD::OutputArg> &,
MaybeAlign RetAlign,
std::optional<unsigned> FirstVAArg,
const CallBase &CB, unsigned UniqueCallSite) const;
@@ -272,8 +276,8 @@ private:
const NVPTXSubtarget &STI; // cache the subtarget here
mutable unsigned GlobalUniqueCallSite;
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
SDValue getParamSymbol(SelectionDAG &DAG, int I, EVT T) const;
SDValue getCallParamSymbol(SelectionDAG &DAG, int I, EVT T) const;
SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;

View File

@@ -1990,9 +1990,9 @@ defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
def SDTDeclareParamProfile :
def SDTDeclareArrayParam :
SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
def SDTDeclareScalarParamProfile :
def SDTDeclareScalarParam :
SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
@@ -2001,22 +2001,17 @@ def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
def SDTProxyRegProfile : SDTypeProfile<1, 1, []>;
def DeclareParam :
SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>;
def declare_array_param :
SDNode<"NVPTXISD::DeclareArrayParam", SDTDeclareArrayParam,
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareScalarParam :
SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRetParam :
SDNode<"NVPTXISD::DeclareRetParam",
SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>,
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def DeclareRet :
SDNode<"NVPTXISD::DeclareRet",
SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>,
def declare_scalar_param :
SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParam,
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def LoadParam :
SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
[SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
@@ -2037,9 +2032,8 @@ def StoreParamV4 :
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def MoveParam :
SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
def ProxyReg :
SDNode<"NVPTXISD::ProxyReg", SDTProxyRegProfile,
[SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
def proxy_reg :
SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>;
/// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
/// NumParams, Callee, Proto, InGlue)
@@ -2188,23 +2182,17 @@ defm StoreParamV2F64 : StoreParamV2Inst<B64, f64imm, ".b64">;
defm StoreParamV4F32 : StoreParamV4Inst<B32, f32imm, ".b32">;
def DeclareRetMemInst :
NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size),
".param .align $align .b8 retval0[$size];",
[(DeclareRetParam imm:$align, imm:$size)]>;
def DeclareRetScalarInst :
NVPTXInst<(outs), (ins i32imm:$size),
".param .b$size retval0;",
[(DeclareRet imm:$size)]>;
def DeclareParamInst :
NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
".param .align $align .b8 param$a[$size];",
[(DeclareParam imm:$align, imm:$a, imm:$size)]>;
def DeclareScalarParamInst :
def DECLARE_PARAM_array :
NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size),
".param .align $align .b8 \t$a[$size];", []>;
def DECLARE_PARAM_scalar :
NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
".param .b$size param$a;",
[(DeclareScalarParam imm:$a, imm:$size)]>;
".param .b$size \t$a;", []>;
def : Pat<(declare_array_param externalsym:$a, imm:$align, imm:$size),
(DECLARE_PARAM_array (to_texternsym $a), imm:$align, imm:$size)>;
def : Pat<(declare_scalar_param externalsym:$a, imm:$size),
(DECLARE_PARAM_scalar (to_texternsym $a), imm:$size)>;
foreach t = [I32RT, I64RT] in {
defvar inst_name = "MOV" # t.Size # "_PARAM";
@@ -2217,7 +2205,7 @@ multiclass ProxyRegInst<string SzStr, NVPTXRegClass rc> {
def NAME : BasicNVPTXInst<(outs rc:$dst), (ins rc:$src),
"mov." # SzStr>;
foreach vt = rc.RegTypes in
def : Pat<(vt (ProxyReg vt:$src)), (!cast<NVPTXInst>(NAME) $src)>;
def : Pat<(vt (proxy_reg vt:$src)), (!cast<NVPTXInst>(NAME) $src)>;
}
defm ProxyRegB1 : ProxyRegInst<"pred", B1>;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -21,17 +21,17 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
; SM30-NEXT: and.b32 %r10, %r9, 3;
; SM30-NEXT: shl.b32 %r1, %r10, 3;
; SM30-NEXT: mov.b32 %r11, 255;
; SM30-NEXT: shl.b32 %r12, %r11, %r1;
; SM30-NEXT: not.b32 %r2, %r12;
; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
; SM30-NEXT: and.b32 %r14, %r13, 255;
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
; SM30-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1];
; SM30-NEXT: shl.b32 %r4, %r15, %r1;
; SM30-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1];
; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
; SM30-NEXT: and.b32 %r11, %r10, 3;
; SM30-NEXT: shl.b32 %r1, %r11, 3;
; SM30-NEXT: mov.b32 %r12, 255;
; SM30-NEXT: shl.b32 %r13, %r12, %r1;
; SM30-NEXT: not.b32 %r2, %r13;
; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
; SM30-NEXT: and.b32 %r15, %r14, 255;
; SM30-NEXT: shl.b32 %r3, %r15, %r1;
; SM30-NEXT: shl.b32 %r4, %r9, %r1;
; SM30-NEXT: ld.b32 %r16, [%rd1];
; SM30-NEXT: and.b32 %r20, %r16, %r2;
; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
@@ -48,7 +48,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-NEXT: mov.b32 %r20, %r8;
; SM30-NEXT: @%p2 bra $L__BB0_1;
; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end
; SM30-NEXT: st.param.b32 [func_retval0], %r13;
; SM30-NEXT: st.param.b32 [func_retval0], %r14;
; SM30-NEXT: ret;
;
; SM70-LABEL: relaxed_sys_i8(
@@ -62,17 +62,17 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
; SM70-NEXT: shl.b32 %r1, %r10, 3;
; SM70-NEXT: mov.b32 %r11, 255;
; SM70-NEXT: shl.b32 %r12, %r11, %r1;
; SM70-NEXT: not.b32 %r2, %r12;
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
; SM70-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.param.b8 %r9, [relaxed_sys_i8_param_1];
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
; SM70-NEXT: and.b32 %r11, %r10, 3;
; SM70-NEXT: shl.b32 %r1, %r11, 3;
; SM70-NEXT: mov.b32 %r12, 255;
; SM70-NEXT: shl.b32 %r13, %r12, %r1;
; SM70-NEXT: not.b32 %r2, %r13;
; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
; SM70-NEXT: and.b32 %r15, %r14, 255;
; SM70-NEXT: shl.b32 %r3, %r15, %r1;
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
; SM70-NEXT: ld.b32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
@@ -89,7 +89,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.b32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB0_1;
; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
; SM90-LABEL: relaxed_sys_i8(
; SM90: {
@@ -147,17 +147,17 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2];
; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0];
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
; SM30-NEXT: and.b32 %r10, %r9, 3;
; SM30-NEXT: shl.b32 %r1, %r10, 3;
; SM30-NEXT: mov.b32 %r11, 255;
; SM30-NEXT: shl.b32 %r12, %r11, %r1;
; SM30-NEXT: not.b32 %r2, %r12;
; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
; SM30-NEXT: and.b32 %r14, %r13, 255;
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
; SM30-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1];
; SM30-NEXT: shl.b32 %r4, %r15, %r1;
; SM30-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1];
; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
; SM30-NEXT: and.b32 %r11, %r10, 3;
; SM30-NEXT: shl.b32 %r1, %r11, 3;
; SM30-NEXT: mov.b32 %r12, 255;
; SM30-NEXT: shl.b32 %r13, %r12, %r1;
; SM30-NEXT: not.b32 %r2, %r13;
; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
; SM30-NEXT: and.b32 %r15, %r14, 255;
; SM30-NEXT: shl.b32 %r3, %r15, %r1;
; SM30-NEXT: shl.b32 %r4, %r9, %r1;
; SM30-NEXT: ld.b32 %r16, [%rd1];
; SM30-NEXT: and.b32 %r20, %r16, %r2;
; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
@@ -175,7 +175,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-NEXT: @%p2 bra $L__BB1_1;
; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end
; SM30-NEXT: membar.sys;
; SM30-NEXT: st.param.b32 [func_retval0], %r13;
; SM30-NEXT: st.param.b32 [func_retval0], %r14;
; SM30-NEXT: ret;
;
; SM70-LABEL: acquire_sys_i8(
@@ -189,17 +189,17 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2];
; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
; SM70-NEXT: shl.b32 %r1, %r10, 3;
; SM70-NEXT: mov.b32 %r11, 255;
; SM70-NEXT: shl.b32 %r12, %r11, %r1;
; SM70-NEXT: not.b32 %r2, %r12;
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
; SM70-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.param.b8 %r9, [acquire_sys_i8_param_1];
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
; SM70-NEXT: and.b32 %r11, %r10, 3;
; SM70-NEXT: shl.b32 %r1, %r11, 3;
; SM70-NEXT: mov.b32 %r12, 255;
; SM70-NEXT: shl.b32 %r13, %r12, %r1;
; SM70-NEXT: not.b32 %r2, %r13;
; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
; SM70-NEXT: and.b32 %r15, %r14, 255;
; SM70-NEXT: shl.b32 %r3, %r15, %r1;
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
; SM70-NEXT: ld.b32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
@@ -217,7 +217,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: @%p2 bra $L__BB1_1;
; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
; SM90-LABEL: acquire_sys_i8(
; SM90: {
@@ -276,18 +276,18 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2];
; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0];
; SM30-NEXT: membar.sys;
; SM30-NEXT: ld.param.b8 %r9, [release_sys_i8_param_1];
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
; SM30-NEXT: and.b32 %r10, %r9, 3;
; SM30-NEXT: shl.b32 %r1, %r10, 3;
; SM30-NEXT: mov.b32 %r11, 255;
; SM30-NEXT: shl.b32 %r12, %r11, %r1;
; SM30-NEXT: not.b32 %r2, %r12;
; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
; SM30-NEXT: and.b32 %r14, %r13, 255;
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
; SM30-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1];
; SM30-NEXT: shl.b32 %r4, %r15, %r1;
; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
; SM30-NEXT: and.b32 %r11, %r10, 3;
; SM30-NEXT: shl.b32 %r1, %r11, 3;
; SM30-NEXT: mov.b32 %r12, 255;
; SM30-NEXT: shl.b32 %r13, %r12, %r1;
; SM30-NEXT: not.b32 %r2, %r13;
; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
; SM30-NEXT: and.b32 %r15, %r14, 255;
; SM30-NEXT: shl.b32 %r3, %r15, %r1;
; SM30-NEXT: shl.b32 %r4, %r9, %r1;
; SM30-NEXT: ld.b32 %r16, [%rd1];
; SM30-NEXT: and.b32 %r20, %r16, %r2;
; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
@@ -304,7 +304,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-NEXT: mov.b32 %r20, %r8;
; SM30-NEXT: @%p2 bra $L__BB2_1;
; SM30-NEXT: $L__BB2_3: // %partword.cmpxchg.end
; SM30-NEXT: st.param.b32 [func_retval0], %r13;
; SM30-NEXT: st.param.b32 [func_retval0], %r14;
; SM30-NEXT: ret;
;
; SM70-LABEL: release_sys_i8(
@@ -318,18 +318,18 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2];
; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0];
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: ld.param.b8 %r9, [release_sys_i8_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
; SM70-NEXT: shl.b32 %r1, %r10, 3;
; SM70-NEXT: mov.b32 %r11, 255;
; SM70-NEXT: shl.b32 %r12, %r11, %r1;
; SM70-NEXT: not.b32 %r2, %r12;
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
; SM70-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
; SM70-NEXT: and.b32 %r11, %r10, 3;
; SM70-NEXT: shl.b32 %r1, %r11, 3;
; SM70-NEXT: mov.b32 %r12, 255;
; SM70-NEXT: shl.b32 %r13, %r12, %r1;
; SM70-NEXT: not.b32 %r2, %r13;
; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
; SM70-NEXT: and.b32 %r15, %r14, 255;
; SM70-NEXT: shl.b32 %r3, %r15, %r1;
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
; SM70-NEXT: ld.b32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
@@ -346,7 +346,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.b32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB2_1;
; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
; SM90-LABEL: release_sys_i8(
; SM90: {
@@ -405,18 +405,18 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
; SM30-NEXT: membar.sys;
; SM30-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1];
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
; SM30-NEXT: and.b32 %r10, %r9, 3;
; SM30-NEXT: shl.b32 %r1, %r10, 3;
; SM30-NEXT: mov.b32 %r11, 255;
; SM30-NEXT: shl.b32 %r12, %r11, %r1;
; SM30-NEXT: not.b32 %r2, %r12;
; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
; SM30-NEXT: and.b32 %r14, %r13, 255;
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
; SM30-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1];
; SM30-NEXT: shl.b32 %r4, %r15, %r1;
; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
; SM30-NEXT: and.b32 %r11, %r10, 3;
; SM30-NEXT: shl.b32 %r1, %r11, 3;
; SM30-NEXT: mov.b32 %r12, 255;
; SM30-NEXT: shl.b32 %r13, %r12, %r1;
; SM30-NEXT: not.b32 %r2, %r13;
; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
; SM30-NEXT: and.b32 %r15, %r14, 255;
; SM30-NEXT: shl.b32 %r3, %r15, %r1;
; SM30-NEXT: shl.b32 %r4, %r9, %r1;
; SM30-NEXT: ld.b32 %r16, [%rd1];
; SM30-NEXT: and.b32 %r20, %r16, %r2;
; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
@@ -434,7 +434,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-NEXT: @%p2 bra $L__BB3_1;
; SM30-NEXT: $L__BB3_3: // %partword.cmpxchg.end
; SM30-NEXT: membar.sys;
; SM30-NEXT: st.param.b32 [func_retval0], %r13;
; SM30-NEXT: st.param.b32 [func_retval0], %r14;
; SM30-NEXT: ret;
;
; SM70-LABEL: acq_rel_sys_i8(
@@ -448,18 +448,18 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: ld.param.b8 %r9, [acq_rel_sys_i8_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
; SM70-NEXT: shl.b32 %r1, %r10, 3;
; SM70-NEXT: mov.b32 %r11, 255;
; SM70-NEXT: shl.b32 %r12, %r11, %r1;
; SM70-NEXT: not.b32 %r2, %r12;
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
; SM70-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
; SM70-NEXT: and.b32 %r11, %r10, 3;
; SM70-NEXT: shl.b32 %r1, %r11, 3;
; SM70-NEXT: mov.b32 %r12, 255;
; SM70-NEXT: shl.b32 %r13, %r12, %r1;
; SM70-NEXT: not.b32 %r2, %r13;
; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
; SM70-NEXT: and.b32 %r15, %r14, 255;
; SM70-NEXT: shl.b32 %r3, %r15, %r1;
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
; SM70-NEXT: ld.b32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
@@ -477,7 +477,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: @%p2 bra $L__BB3_1;
; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
; SM90-LABEL: acq_rel_sys_i8(
; SM90: {
@@ -537,18 +537,18 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
; SM30-NEXT: membar.sys;
; SM30-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1];
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
; SM30-NEXT: and.b32 %r10, %r9, 3;
; SM30-NEXT: shl.b32 %r1, %r10, 3;
; SM30-NEXT: mov.b32 %r11, 255;
; SM30-NEXT: shl.b32 %r12, %r11, %r1;
; SM30-NEXT: not.b32 %r2, %r12;
; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
; SM30-NEXT: and.b32 %r14, %r13, 255;
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
; SM30-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1];
; SM30-NEXT: shl.b32 %r4, %r15, %r1;
; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
; SM30-NEXT: and.b32 %r11, %r10, 3;
; SM30-NEXT: shl.b32 %r1, %r11, 3;
; SM30-NEXT: mov.b32 %r12, 255;
; SM30-NEXT: shl.b32 %r13, %r12, %r1;
; SM30-NEXT: not.b32 %r2, %r13;
; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
; SM30-NEXT: and.b32 %r15, %r14, 255;
; SM30-NEXT: shl.b32 %r3, %r15, %r1;
; SM30-NEXT: shl.b32 %r4, %r9, %r1;
; SM30-NEXT: ld.b32 %r16, [%rd1];
; SM30-NEXT: and.b32 %r20, %r16, %r2;
; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop
@@ -566,7 +566,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-NEXT: @%p2 bra $L__BB4_1;
; SM30-NEXT: $L__BB4_3: // %partword.cmpxchg.end
; SM30-NEXT: membar.sys;
; SM30-NEXT: st.param.b32 [func_retval0], %r13;
; SM30-NEXT: st.param.b32 [func_retval0], %r14;
; SM30-NEXT: ret;
;
; SM70-LABEL: seq_cst_sys_i8(
@@ -580,18 +580,18 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
; SM70-NEXT: fence.sc.sys;
; SM70-NEXT: ld.param.b8 %r9, [seq_cst_sys_i8_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
; SM70-NEXT: shl.b32 %r1, %r10, 3;
; SM70-NEXT: mov.b32 %r11, 255;
; SM70-NEXT: shl.b32 %r12, %r11, %r1;
; SM70-NEXT: not.b32 %r2, %r12;
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
; SM70-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
; SM70-NEXT: and.b32 %r11, %r10, 3;
; SM70-NEXT: shl.b32 %r1, %r11, 3;
; SM70-NEXT: mov.b32 %r12, 255;
; SM70-NEXT: shl.b32 %r13, %r12, %r1;
; SM70-NEXT: not.b32 %r2, %r13;
; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
; SM70-NEXT: and.b32 %r15, %r14, 255;
; SM70-NEXT: shl.b32 %r3, %r15, %r1;
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
; SM70-NEXT: ld.b32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop
@@ -609,7 +609,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: @%p2 bra $L__BB4_1;
; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
; SM90-LABEL: seq_cst_sys_i8(
; SM90: {

View File

@@ -24,11 +24,11 @@ define i16 @cvt_i16_i32(i32 %x) {
define i16 @cvt_i16_i64(i64 %x) {
; CHECK-LABEL: cvt_i16_i64(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [cvt_i16_i64_param_0];
; CHECK-NEXT: st.param.b32 [func_retval0], %rd1;
; CHECK-NEXT: ld.param.b16 %r1, [cvt_i16_i64_param_0];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%a = trunc i64 %x to i16
ret i16 %a

View File

@@ -158,27 +158,24 @@ define i16 @test_v8i8(i64 %a) {
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<16>;
; CHECK-NEXT: .reg .b32 %r<12>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_v8i8_param_0];
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; }
; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v8i8_param_0];
; CHECK-NEXT: bfe.s32 %r3, %r1, 0, 8;
; CHECK-NEXT: cvt.s8.s32 %rs1, %r3;
; CHECK-NEXT: bfe.s32 %r4, %r2, 8, 8;
; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
; CHECK-NEXT: cvt.s8.s32 %rs2, %r4;
; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8;
; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
; CHECK-NEXT: cvt.s8.s32 %rs3, %r5;
; CHECK-NEXT: bfe.s32 %r6, %r2, 24, 8;
; CHECK-NEXT: bfe.s32 %r6, %r1, 24, 8;
; CHECK-NEXT: cvt.s8.s32 %rs4, %r6;
; CHECK-NEXT: bfe.s32 %r7, %r1, 0, 8;
; CHECK-NEXT: bfe.s32 %r7, %r2, 0, 8;
; CHECK-NEXT: cvt.s8.s32 %rs5, %r7;
; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8;
; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
; CHECK-NEXT: cvt.s8.s32 %rs6, %r8;
; CHECK-NEXT: bfe.s32 %r9, %r1, 16, 8;
; CHECK-NEXT: bfe.s32 %r9, %r2, 16, 8;
; CHECK-NEXT: cvt.s8.s32 %rs7, %r9;
; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8;
; CHECK-NEXT: bfe.s32 %r10, %r2, 24, 8;
; CHECK-NEXT: cvt.s8.s32 %rs8, %r10;
; CHECK-NEXT: add.s16 %rs9, %rs1, %rs2;
; CHECK-NEXT: add.s16 %rs10, %rs3, %rs4;

View File

@@ -121,7 +121,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-LABEL: grid_const_escape(
; PTX: {
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b32 %r<2>;
; PTX-NEXT: .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
@@ -153,7 +153,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
; PTX-NEXT: .local .align 4 .b8 __local_depot4[4];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<4>;
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
@@ -255,7 +255,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escape(
; PTX: {
; PTX-NEXT: .reg .b32 %r<5>;
; PTX-NEXT: .reg .b32 %r<4>;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
@@ -295,7 +295,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escapemem(
; PTX: {
; PTX-NEXT: .reg .b32 %r<6>;
; PTX-NEXT: .reg .b32 %r<5>;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:

View File

@@ -31,7 +31,7 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %
; PTX-LABEL: load_alignment(
; PTX: {
; PTX-NEXT: .reg .b32 %r<4>;
; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
; PTX-NEXT: mov.b64 %rd1, load_alignment_param_0;
@@ -76,7 +76,7 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
;
; PTX-LABEL: load_padding(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-NEXT: .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, load_padding_param_0;

View File

@@ -8,7 +8,7 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-LABEL: wombat(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<11>;
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %bb
; CHECK-NEXT: ld.param.b32 %r4, [wombat_param_2];
@@ -27,11 +27,11 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3;
; CHECK-NEXT: or.b32 %r8, %r4, %r7;
; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8;
; CHECK-NEXT: cvt.rn.f64.s32 %rd3, %r9;
; CHECK-NEXT: cvt.rn.f64.u32 %rd4, %r10;
; CHECK-NEXT: add.rn.f64 %rd5, %rd4, %rd3;
; CHECK-NEXT: mov.b64 %rd6, 0;
; CHECK-NEXT: st.global.b64 [%rd6], %rd5;
; CHECK-NEXT: cvt.rn.f64.s32 %rd2, %r9;
; CHECK-NEXT: cvt.rn.f64.u32 %rd3, %r10;
; CHECK-NEXT: add.rn.f64 %rd4, %rd3, %rd2;
; CHECK-NEXT: mov.b64 %rd5, 0;
; CHECK-NEXT: st.global.b64 [%rd5], %rd4;
; CHECK-NEXT: mov.b32 %r10, 1;
; CHECK-NEXT: bra.uni $L__BB0_1;
bb:

View File

@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=nvptx64 -verify-machineinstrs < %s | FileCheck %s
; RUN: %if ptxas %{ llc -mtriple=nvptx64 -verify-machineinstrs < %s | %ptxas-verify %}
@@ -10,9 +11,19 @@
; value will be identical regardless of the boolean representation.
; Check that the optimization triggers in this case.
; CHECK-LABEL: @pow2_mask_cmp
; CHECK: bfe.u32 {{%r[0-9]+}}, {{%r[0-9]+}}, 3, 1
define i32 @pow2_mask_cmp(i32 %x) {
; CHECK-LABEL: pow2_mask_cmp(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [pow2_mask_cmp_param_0];
; CHECK-NEXT: shr.u16 %rs2, %rs1, 3;
; CHECK-NEXT: cvt.u32.u16 %r1, %rs2;
; CHECK-NEXT: and.b32 %r2, %r1, 1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%a = and i32 %x, 8
%cmp = icmp ne i32 %a, 0
%r = zext i1 %cmp to i32

View File

@@ -445,12 +445,12 @@ define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) {
; CHECK-NEXT: .reg .b16 %rs<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irrr_param_0];
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irrr_param_2];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irrr_param_1];
; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_irrr_param_2];
; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_irrr_param_0];
; CHECK-NEXT: { // callseq 24, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, %rs2, %rs3};
; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs3, %rs2, %rs1};
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 24
; CHECK-NEXT: ret;
@@ -467,12 +467,12 @@ define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) {
; CHECK-NEXT: .reg .b16 %rs<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rirr_param_0];
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rirr_param_2];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rirr_param_1];
; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rirr_param_2];
; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rirr_param_0];
; CHECK-NEXT: { // callseq 25, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, %rs2, %rs3};
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, 2, %rs2, %rs1};
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 25
; CHECK-NEXT: ret;
@@ -489,12 +489,12 @@ define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) {
; CHECK-NEXT: .reg .b16 %rs<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrir_param_0];
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrir_param_2];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrir_param_1];
; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrir_param_2];
; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrir_param_0];
; CHECK-NEXT: { // callseq 26, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, 3, %rs3};
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, %rs2, 3, %rs1};
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 26
; CHECK-NEXT: ret;
@@ -511,12 +511,12 @@ define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) {
; CHECK-NEXT: .reg .b16 %rs<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrri_param_0];
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrri_param_2];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrri_param_1];
; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrri_param_2];
; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrri_param_0];
; CHECK-NEXT: { // callseq 27, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, %rs3, 4};
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, %rs2, %rs1, 4};
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 27
; CHECK-NEXT: ret;
@@ -533,11 +533,11 @@ define void @st_param_v4_i8_iirr(i8 %c, i8 %d) {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iirr_param_0];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_iirr_param_1];
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iirr_param_1];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_iirr_param_0];
; CHECK-NEXT: { // callseq 28, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs1, %rs2};
; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs2, %rs1};
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 28
; CHECK-NEXT: ret;
@@ -554,11 +554,11 @@ define void @st_param_v4_i8_irir(i8 %b, i8 %d) {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irir_param_0];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irir_param_1];
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irir_param_1];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irir_param_0];
; CHECK-NEXT: { // callseq 29, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, 3, %rs2};
; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs2, 3, %rs1};
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 29
; CHECK-NEXT: ret;
@@ -575,11 +575,11 @@ define void @st_param_v4_i8_irri(i8 %b, i8 %c) {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irri_param_0];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irri_param_1];
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irri_param_1];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irri_param_0];
; CHECK-NEXT: { // callseq 30, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, %rs2, 4};
; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs2, %rs1, 4};
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 30
; CHECK-NEXT: ret;
@@ -596,11 +596,11 @@ define void @st_param_v4_i8_riir(i8 %a, i8 %d) {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riir_param_0];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riir_param_1];
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riir_param_1];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riir_param_0];
; CHECK-NEXT: { // callseq 31, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, 3, %rs2};
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, 2, 3, %rs1};
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 31
; CHECK-NEXT: ret;
@@ -617,11 +617,11 @@ define void @st_param_v4_i8_riri(i8 %a, i8 %c) {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riri_param_0];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riri_param_1];
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riri_param_1];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riri_param_0];
; CHECK-NEXT: { // callseq 32, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, %rs2, 4};
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, 2, %rs1, 4};
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 32
; CHECK-NEXT: ret;
@@ -638,11 +638,11 @@ define void @st_param_v4_i8_rrii(i8 %a, i8 %b) {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrii_param_0];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrii_param_1];
; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrii_param_1];
; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrii_param_0];
; CHECK-NEXT: { // callseq 33, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, 3, 4};
; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, %rs1, 3, 4};
; CHECK-NEXT: call.uni call_v4_i8, (param0);
; CHECK-NEXT: } // callseq 33
; CHECK-NEXT: ret;

View File

@@ -103,16 +103,16 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16];
; CHECK-NEXT: } // callseq 1
; CHECK-NEXT: cvt.u32.u16 %r16, %rs3;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r18, %rs5;
; CHECK-NEXT: cvt.u32.u16 %r19, %rs6;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs3;
; CHECK-NEXT: cvt.u32.u16 %r16, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs5;
; CHECK-NEXT: cvt.u32.u16 %r18, %rs6;
; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
; CHECK-NEXT: st.param.b8 [func_retval0+8], %r19;
; CHECK-NEXT: st.param.b8 [func_retval0+7], %r18;
; CHECK-NEXT: st.param.b8 [func_retval0+6], %r17;
; CHECK-NEXT: st.param.b8 [func_retval0+5], %r16;
; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18;
; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16;
; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15;
; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
; CHECK-NEXT: ret;
%r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a)
@@ -185,48 +185,48 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
; CHECK-NEXT: ld.param.b8 %rs10, [retval0+16];
; CHECK-NEXT: ld.param.b64 %rd32, [retval0+24];
; CHECK-NEXT: } // callseq 2
; CHECK-NEXT: cvt.u64.u16 %rd35, %rs3;
; CHECK-NEXT: cvt.u64.u16 %rd33, %rs3;
; CHECK-NEXT: and.b64 %rd34, %rd33, 255;
; CHECK-NEXT: cvt.u64.u16 %rd35, %rs4;
; CHECK-NEXT: and.b64 %rd36, %rd35, 255;
; CHECK-NEXT: cvt.u64.u16 %rd37, %rs4;
; CHECK-NEXT: and.b64 %rd38, %rd37, 255;
; CHECK-NEXT: shl.b64 %rd39, %rd38, 8;
; CHECK-NEXT: or.b64 %rd40, %rd36, %rd39;
; CHECK-NEXT: cvt.u64.u16 %rd41, %rs5;
; CHECK-NEXT: and.b64 %rd42, %rd41, 255;
; CHECK-NEXT: shl.b64 %rd43, %rd42, 16;
; CHECK-NEXT: or.b64 %rd44, %rd40, %rd43;
; CHECK-NEXT: cvt.u64.u16 %rd45, %rs6;
; CHECK-NEXT: and.b64 %rd46, %rd45, 255;
; CHECK-NEXT: shl.b64 %rd47, %rd46, 24;
; CHECK-NEXT: or.b64 %rd48, %rd44, %rd47;
; CHECK-NEXT: cvt.u64.u16 %rd49, %rs7;
; CHECK-NEXT: and.b64 %rd50, %rd49, 255;
; CHECK-NEXT: shl.b64 %rd51, %rd50, 32;
; CHECK-NEXT: or.b64 %rd52, %rd48, %rd51;
; CHECK-NEXT: cvt.u64.u16 %rd53, %rs8;
; CHECK-NEXT: and.b64 %rd54, %rd53, 255;
; CHECK-NEXT: shl.b64 %rd55, %rd54, 40;
; CHECK-NEXT: or.b64 %rd56, %rd52, %rd55;
; CHECK-NEXT: cvt.u64.u16 %rd57, %rs9;
; CHECK-NEXT: and.b64 %rd58, %rd57, 255;
; CHECK-NEXT: shl.b64 %rd59, %rd58, 48;
; CHECK-NEXT: or.b64 %rd60, %rd56, %rd59;
; CHECK-NEXT: cvt.u64.u16 %rd61, %rs10;
; CHECK-NEXT: shl.b64 %rd62, %rd61, 56;
; CHECK-NEXT: or.b64 %rd63, %rd60, %rd62;
; CHECK-NEXT: shl.b64 %rd37, %rd36, 8;
; CHECK-NEXT: or.b64 %rd38, %rd34, %rd37;
; CHECK-NEXT: cvt.u64.u16 %rd39, %rs5;
; CHECK-NEXT: and.b64 %rd40, %rd39, 255;
; CHECK-NEXT: shl.b64 %rd41, %rd40, 16;
; CHECK-NEXT: or.b64 %rd42, %rd38, %rd41;
; CHECK-NEXT: cvt.u64.u16 %rd43, %rs6;
; CHECK-NEXT: and.b64 %rd44, %rd43, 255;
; CHECK-NEXT: shl.b64 %rd45, %rd44, 24;
; CHECK-NEXT: or.b64 %rd46, %rd42, %rd45;
; CHECK-NEXT: cvt.u64.u16 %rd47, %rs7;
; CHECK-NEXT: and.b64 %rd48, %rd47, 255;
; CHECK-NEXT: shl.b64 %rd49, %rd48, 32;
; CHECK-NEXT: or.b64 %rd50, %rd46, %rd49;
; CHECK-NEXT: cvt.u64.u16 %rd51, %rs8;
; CHECK-NEXT: and.b64 %rd52, %rd51, 255;
; CHECK-NEXT: shl.b64 %rd53, %rd52, 40;
; CHECK-NEXT: or.b64 %rd54, %rd50, %rd53;
; CHECK-NEXT: cvt.u64.u16 %rd55, %rs9;
; CHECK-NEXT: and.b64 %rd56, %rd55, 255;
; CHECK-NEXT: shl.b64 %rd57, %rd56, 48;
; CHECK-NEXT: or.b64 %rd58, %rd54, %rd57;
; CHECK-NEXT: cvt.u64.u16 %rd59, %rs10;
; CHECK-NEXT: shl.b64 %rd60, %rd59, 56;
; CHECK-NEXT: or.b64 %rd61, %rd58, %rd60;
; CHECK-NEXT: st.param.b64 [func_retval0], %rd31;
; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs2;
; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd45;
; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd41;
; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd37;
; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd35;
; CHECK-NEXT: shr.u64 %rd64, %rd52, 32;
; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43;
; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd39;
; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35;
; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd33;
; CHECK-NEXT: shr.u64 %rd64, %rd50, 32;
; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd64;
; CHECK-NEXT: shr.u64 %rd65, %rd56, 40;
; CHECK-NEXT: shr.u64 %rd65, %rd54, 40;
; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd65;
; CHECK-NEXT: shr.u64 %rd66, %rd60, 48;
; CHECK-NEXT: shr.u64 %rd66, %rd58, 48;
; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd66;
; CHECK-NEXT: shr.u64 %rd67, %rd63, 56;
; CHECK-NEXT: shr.u64 %rd67, %rd61, 56;
; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd67;
; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd32;
; CHECK-NEXT: ret;
@@ -317,16 +317,16 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16];
; CHECK-NEXT: } // callseq 4
; CHECK-NEXT: cvt.u32.u16 %r16, %rs3;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r18, %rs5;
; CHECK-NEXT: cvt.u32.u16 %r19, %rs6;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs3;
; CHECK-NEXT: cvt.u32.u16 %r16, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs5;
; CHECK-NEXT: cvt.u32.u16 %r18, %rs6;
; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
; CHECK-NEXT: st.param.b8 [func_retval0+8], %r19;
; CHECK-NEXT: st.param.b8 [func_retval0+7], %r18;
; CHECK-NEXT: st.param.b8 [func_retval0+6], %r17;
; CHECK-NEXT: st.param.b8 [func_retval0+5], %r16;
; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18;
; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16;
; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15;
; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
; CHECK-NEXT: ret;
%r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a)
@@ -376,16 +376,16 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8];
; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16];
; CHECK-NEXT: } // callseq 5
; CHECK-NEXT: cvt.u32.u16 %r16, %rs3;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r18, %rs5;
; CHECK-NEXT: cvt.u32.u16 %r19, %rs6;
; CHECK-NEXT: cvt.u32.u16 %r15, %rs3;
; CHECK-NEXT: cvt.u32.u16 %r16, %rs4;
; CHECK-NEXT: cvt.u32.u16 %r17, %rs5;
; CHECK-NEXT: cvt.u32.u16 %r18, %rs6;
; CHECK-NEXT: st.param.b32 [func_retval0], %r14;
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2;
; CHECK-NEXT: st.param.b8 [func_retval0+8], %r19;
; CHECK-NEXT: st.param.b8 [func_retval0+7], %r18;
; CHECK-NEXT: st.param.b8 [func_retval0+6], %r17;
; CHECK-NEXT: st.param.b8 [func_retval0+5], %r16;
; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18;
; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17;
; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16;
; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15;
; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2;
; CHECK-NEXT: ret;
%r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a)
@@ -458,48 +458,48 @@ define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
; CHECK-NEXT: ld.param.b8 %rs10, [retval0+16];
; CHECK-NEXT: ld.param.b64 %rd32, [retval0+24];
; CHECK-NEXT: } // callseq 6
; CHECK-NEXT: cvt.u64.u16 %rd35, %rs3;
; CHECK-NEXT: cvt.u64.u16 %rd33, %rs3;
; CHECK-NEXT: and.b64 %rd34, %rd33, 255;
; CHECK-NEXT: cvt.u64.u16 %rd35, %rs4;
; CHECK-NEXT: and.b64 %rd36, %rd35, 255;
; CHECK-NEXT: cvt.u64.u16 %rd37, %rs4;
; CHECK-NEXT: and.b64 %rd38, %rd37, 255;
; CHECK-NEXT: shl.b64 %rd39, %rd38, 8;
; CHECK-NEXT: or.b64 %rd40, %rd36, %rd39;
; CHECK-NEXT: cvt.u64.u16 %rd41, %rs5;
; CHECK-NEXT: and.b64 %rd42, %rd41, 255;
; CHECK-NEXT: shl.b64 %rd43, %rd42, 16;
; CHECK-NEXT: or.b64 %rd44, %rd40, %rd43;
; CHECK-NEXT: cvt.u64.u16 %rd45, %rs6;
; CHECK-NEXT: and.b64 %rd46, %rd45, 255;
; CHECK-NEXT: shl.b64 %rd47, %rd46, 24;
; CHECK-NEXT: or.b64 %rd48, %rd44, %rd47;
; CHECK-NEXT: cvt.u64.u16 %rd49, %rs7;
; CHECK-NEXT: and.b64 %rd50, %rd49, 255;
; CHECK-NEXT: shl.b64 %rd51, %rd50, 32;
; CHECK-NEXT: or.b64 %rd52, %rd48, %rd51;
; CHECK-NEXT: cvt.u64.u16 %rd53, %rs8;
; CHECK-NEXT: and.b64 %rd54, %rd53, 255;
; CHECK-NEXT: shl.b64 %rd55, %rd54, 40;
; CHECK-NEXT: or.b64 %rd56, %rd52, %rd55;
; CHECK-NEXT: cvt.u64.u16 %rd57, %rs9;
; CHECK-NEXT: and.b64 %rd58, %rd57, 255;
; CHECK-NEXT: shl.b64 %rd59, %rd58, 48;
; CHECK-NEXT: or.b64 %rd60, %rd56, %rd59;
; CHECK-NEXT: cvt.u64.u16 %rd61, %rs10;
; CHECK-NEXT: shl.b64 %rd62, %rd61, 56;
; CHECK-NEXT: or.b64 %rd63, %rd60, %rd62;
; CHECK-NEXT: shl.b64 %rd37, %rd36, 8;
; CHECK-NEXT: or.b64 %rd38, %rd34, %rd37;
; CHECK-NEXT: cvt.u64.u16 %rd39, %rs5;
; CHECK-NEXT: and.b64 %rd40, %rd39, 255;
; CHECK-NEXT: shl.b64 %rd41, %rd40, 16;
; CHECK-NEXT: or.b64 %rd42, %rd38, %rd41;
; CHECK-NEXT: cvt.u64.u16 %rd43, %rs6;
; CHECK-NEXT: and.b64 %rd44, %rd43, 255;
; CHECK-NEXT: shl.b64 %rd45, %rd44, 24;
; CHECK-NEXT: or.b64 %rd46, %rd42, %rd45;
; CHECK-NEXT: cvt.u64.u16 %rd47, %rs7;
; CHECK-NEXT: and.b64 %rd48, %rd47, 255;
; CHECK-NEXT: shl.b64 %rd49, %rd48, 32;
; CHECK-NEXT: or.b64 %rd50, %rd46, %rd49;
; CHECK-NEXT: cvt.u64.u16 %rd51, %rs8;
; CHECK-NEXT: and.b64 %rd52, %rd51, 255;
; CHECK-NEXT: shl.b64 %rd53, %rd52, 40;
; CHECK-NEXT: or.b64 %rd54, %rd50, %rd53;
; CHECK-NEXT: cvt.u64.u16 %rd55, %rs9;
; CHECK-NEXT: and.b64 %rd56, %rd55, 255;
; CHECK-NEXT: shl.b64 %rd57, %rd56, 48;
; CHECK-NEXT: or.b64 %rd58, %rd54, %rd57;
; CHECK-NEXT: cvt.u64.u16 %rd59, %rs10;
; CHECK-NEXT: shl.b64 %rd60, %rd59, 56;
; CHECK-NEXT: or.b64 %rd61, %rd58, %rd60;
; CHECK-NEXT: st.param.b64 [func_retval0], %rd31;
; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs2;
; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd45;
; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd41;
; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd37;
; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd35;
; CHECK-NEXT: shr.u64 %rd64, %rd52, 32;
; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43;
; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd39;
; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35;
; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd33;
; CHECK-NEXT: shr.u64 %rd64, %rd50, 32;
; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd64;
; CHECK-NEXT: shr.u64 %rd65, %rd56, 40;
; CHECK-NEXT: shr.u64 %rd65, %rd54, 40;
; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd65;
; CHECK-NEXT: shr.u64 %rd66, %rd60, 48;
; CHECK-NEXT: shr.u64 %rd66, %rd58, 48;
; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd66;
; CHECK-NEXT: shr.u64 %rd67, %rd63, 56;
; CHECK-NEXT: shr.u64 %rd67, %rd61, 56;
; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd67;
; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd32;
; CHECK-NEXT: ret;

View File

@@ -348,7 +348,7 @@ define dso_local void @qux() {
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b32 %r<3>;
; CHECK-PTX-NEXT: .reg .b32 %r<2>;
; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry