[NVPTX] Improve device function byval parameter lowering (#129188)

PTX supports 2 methods of accessing device function parameters:

- "simple" case: If a parameters is only loaded, and all loads can
address the parameter via a constant offset, then the parameter may be
loaded via the ".param" address space. This case is not possible if the
parameters is stored to or has it's address taken. This method is
preferable when possible.

- "move param" case: For more complex cases the address of the param may
be placed in a register via a "mov" instruction. This mov also
implicitly moves the param to the ".local" address space and allows for
it to be written to. This essentially defers the responsibilty of the
byval copy to the PTX calling convention.
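
For illustration, the PTX for each case looks roughly like this (mirroring
the examples in the new pass's header comment; "foo" and the concrete
operands are hypothetical):

  // "simple" case: loads address the .param space directly (sketch only).
  ld.param.u32 %r1, [foo_param_1];
  ld.param.u32 %r2, [foo_param_1+4];

  // "move param" case: the address lives in a register and implicitly
  // refers to .local, so stores through it are legal.
  mov.b64      %rd1, foo_param_0;
  st.local.u32 [%rd1], 42;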

The handling of these cases in the NVPTX backend for byval pointers has
some major issues. We currently attempt to determine if a copy is
necessary in NVPTXLowerArgs and either explicitly make an additional
copy in the IR, or insert an "addrspacecast" to move the param to the
param address space. Unfortunately, the criteria for determining which
case is possible are not correct, leading to miscompilations
(https://godbolt.org/z/Gq1fP7a3G). Further, the criteria for the
"simple" case aren't enforceable in LLVM IR across other transformations
and instruction selection, making the choice between the 2 cases in
NVPTXLowerArgs brittle and buggy.

This patch aims to fix these issues and improve address-space-related
optimization. In NVPTXLowerArgs, we conservatively assume that all
parameters will use the "move param" case and the local address space.
Responsibility for switching to the "simple" case is given to a new
MachineIR pass, NVPTXForwardParams, which runs once it has become clear
whether or not this is possible. This ensures that the correct address
space is known for the "move param" case, allowing for optimization,
while still using the "simple" case wherever possible.
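
Concretely, after this change NVPTXLowerArgs roots every device-function
byval argument in the local address space. A minimal sketch of the
resulting IR (a hypothetical function, patterned on the lower-args test
updates below):

  ; hypothetical example of the conservative lowering
  define i32 @ex(ptr byval(i32) %a) {
    %a.local = addrspacecast ptr %a to ptr addrspace(5)
    %a.gen = addrspacecast ptr addrspace(5) %a.local to ptr
    %v = load i32, ptr %a.gen
    ret i32 %v
  }

If, after instruction selection, every use of the resulting pointer is a
plain load, NVPTXForwardParams rewrites those loads to address the
".param" space directly and deletes the now-dead mov.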
Author: Alex MacLean
Date: 2025-02-28 14:15:25 -08:00 (committed via GitHub)
Commit: 0065343159, parent: d2cbd5fe6b
14 changed files with 609 additions and 198 deletions


@@ -16,6 +16,7 @@ set(NVPTXCodeGen_sources
NVPTXAtomicLower.cpp
NVPTXAsmPrinter.cpp
NVPTXAssignValidGlobalNames.cpp
NVPTXForwardParams.cpp
NVPTXFrameLowering.cpp
NVPTXGenericToNVVM.cpp
NVPTXISelDAGToDAG.cpp


@@ -52,6 +52,7 @@ FunctionPass *createNVPTXLowerUnreachablePass(bool TrapUnreachable,
bool NoTrapAfterNoreturn);
MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
MachineFunctionPass *createNVPTXForwardParamsPass();
struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);


@@ -0,0 +1,169 @@
//- NVPTXForwardParams.cpp - NVPTX Forward Device Params Removing Local Copy -//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// PTX supports 2 methods of accessing device function parameters:
//
// - "simple" case: If a parameters is only loaded, and all loads can address
// the parameter via a constant offset, then the parameter may be loaded via
// the ".param" address space. This case is not possible if the parameters
// is stored to or has it's address taken. This method is preferable when
// possible. Ex:
//
// ld.param.u32 %r1, [foo_param_1];
// ld.param.u32 %r2, [foo_param_1+4];
//
// - "move param" case: For more complex cases the address of the param may be
// placed in a register via a "mov" instruction. This "mov" also implicitly
// moves the param to the ".local" address space and allows for it to be
//   written to. This essentially defers the responsibility of the byval copy
// to the PTX calling convention.
//
// mov.b64 %rd1, foo_param_0;
// st.local.u32 [%rd1], 42;
// add.u64 %rd3, %rd1, %rd2;
// ld.local.u32 %r2, [%rd3];
//
// In NVPTXLowerArgs and SelectionDAG, we pessimistically assume that all
// parameters will use the "move param" case and the local address space. This
// pass is responsible for switching to the "simple" case when possible, as it
// is more efficient.
//
// We do this by simply traversing uses of the param "mov" instructions and
// trivially checking if they are all loads.
//
//===----------------------------------------------------------------------===//
#include "NVPTX.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
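
// Traverse the uses of a MOV*_PARAM result, looking through local<->generic
// cvta instructions. Returns true iff every transitive use is a direct load,
// collecting the loads in LoadInsts and the removable cvtas in RemoveList.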
static bool traverseMoveUse(MachineInstr &U, const MachineRegisterInfo &MRI,
SmallVectorImpl<MachineInstr *> &RemoveList,
SmallVectorImpl<MachineInstr *> &LoadInsts) {
switch (U.getOpcode()) {
case NVPTX::LD_f32:
case NVPTX::LD_f64:
case NVPTX::LD_i16:
case NVPTX::LD_i32:
case NVPTX::LD_i64:
case NVPTX::LD_i8:
case NVPTX::LDV_f32_v2:
case NVPTX::LDV_f32_v4:
case NVPTX::LDV_f64_v2:
case NVPTX::LDV_f64_v4:
case NVPTX::LDV_i16_v2:
case NVPTX::LDV_i16_v4:
case NVPTX::LDV_i32_v2:
case NVPTX::LDV_i32_v4:
case NVPTX::LDV_i64_v2:
case NVPTX::LDV_i64_v4:
case NVPTX::LDV_i8_v2:
case NVPTX::LDV_i8_v4: {
LoadInsts.push_back(&U);
return true;
}
case NVPTX::cvta_local:
case NVPTX::cvta_local_64:
case NVPTX::cvta_to_local:
case NVPTX::cvta_to_local_64: {
for (auto &U2 : MRI.use_instructions(U.operands_begin()->getReg()))
if (!traverseMoveUse(U2, MRI, RemoveList, LoadInsts))
return false;
RemoveList.push_back(&U);
return true;
}
default:
return false;
}
}
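
// If every use of this param mov is (transitively) a load, rewrite those
// loads to address the parameter symbol in the .param space directly, and
// queue the mov and any intermediate cvta instructions for removal.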
static bool eliminateMove(MachineInstr &Mov, const MachineRegisterInfo &MRI,
SmallVectorImpl<MachineInstr *> &RemoveList) {
SmallVector<MachineInstr *, 16> MaybeRemoveList;
SmallVector<MachineInstr *, 16> LoadInsts;
for (auto &U : MRI.use_instructions(Mov.operands_begin()->getReg()))
if (!traverseMoveUse(U, MRI, MaybeRemoveList, LoadInsts))
return false;
RemoveList.append(MaybeRemoveList);
RemoveList.push_back(&Mov);
const MachineOperand *ParamSymbol = Mov.uses().begin();
assert(ParamSymbol->isSymbol());
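// Operand indices of the base-pointer and address-space operands within the
// LD_* instructions matched in traverseMoveUse.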
constexpr unsigned LDInstBasePtrOpIdx = 6;
constexpr unsigned LDInstAddrSpaceOpIdx = 2;
for (auto *LI : LoadInsts) {
(LI->uses().begin() + LDInstBasePtrOpIdx)
->ChangeToES(ParamSymbol->getSymbolName());
(LI->uses().begin() + LDInstAddrSpaceOpIdx)
->ChangeToImmediate(NVPTX::AddressSpace::Param);
}
return true;
}
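
// Param movs are created for formal arguments during ISel and are only
// expected in the entry block, so scanning the first block is sufficient.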
static bool forwardDeviceParams(MachineFunction &MF) {
const auto &MRI = MF.getRegInfo();
bool Changed = false;
SmallVector<MachineInstr *, 16> RemoveList;
for (auto &MI : make_early_inc_range(*MF.begin()))
if (MI.getOpcode() == NVPTX::MOV32_PARAM ||
MI.getOpcode() == NVPTX::MOV64_PARAM)
Changed |= eliminateMove(MI, MRI, RemoveList);
for (auto *MI : RemoveList)
MI->eraseFromParent();
return Changed;
}
/// ----------------------------------------------------------------------------
/// Pass (Manager) Boilerplate
/// ----------------------------------------------------------------------------
namespace llvm {
void initializeNVPTXForwardParamsPassPass(PassRegistry &);
} // namespace llvm
namespace {
struct NVPTXForwardParamsPass : public MachineFunctionPass {
static char ID;
NVPTXForwardParamsPass() : MachineFunctionPass(ID) {
initializeNVPTXForwardParamsPassPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
}
};
} // namespace
char NVPTXForwardParamsPass::ID = 0;
INITIALIZE_PASS(NVPTXForwardParamsPass, "nvptx-forward-params",
"NVPTX Forward Params", false, false)
bool NVPTXForwardParamsPass::runOnMachineFunction(MachineFunction &MF) {
return forwardDeviceParams(MF);
}
MachineFunctionPass *llvm::createNVPTXForwardParamsPass() {
return new NVPTXForwardParamsPass();
}


@@ -2197,11 +2197,11 @@ static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
if (N.getOpcode() == NVPTXISD::Wrapper)
return N.getOperand(0);
// addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
// addrspacecast(Wrapper(arg_symbol) to addrspace(PARAM)) -> arg_symbol
if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N))
if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
CastN->getOperand(0).getOpcode() == NVPTXISD::Wrapper)
return selectBaseADDR(CastN->getOperand(0).getOperand(0), DAG);
if (auto *FIN = dyn_cast<FrameIndexSDNode>(N))


@@ -3376,10 +3376,18 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
assert(ObjectVT == Ins[InsIdx].VT &&
"Ins type did not match function type");
SDValue Arg = getParamSymbol(DAG, i, PtrVT);
SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
if (p.getNode())
p.getNode()->setIROrder(i + 1);
InVals.push_back(p);
SDValue P;
if (isKernelFunction(*F)) {
P = DAG.getNode(NVPTXISD::Wrapper, dl, ObjectVT, Arg);
P.getNode()->setIROrder(i + 1);
} else {
P = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
P.getNode()->setIROrder(i + 1);
P = DAG.getAddrSpaceCast(dl, ObjectVT, P, ADDRESS_SPACE_LOCAL,
ADDRESS_SPACE_GENERIC);
}
InVals.push_back(P);
}
if (!OutChains.empty())


@@ -2324,7 +2324,7 @@ def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
def SDTCallValProfile : SDTypeProfile<1, 0, []>;
def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>;
def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
@@ -2688,29 +2688,14 @@ def DeclareScalarRegInst :
".reg .b$size param$a;",
[(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> :
NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
!strconcat("mov", asmstr, " \t$dst, $src;"),
[(set T:$dst, (MoveParam T:$src))]>;
class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty, ValueType vt,
string asmstr> :
NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
!strconcat("mov", asmstr, " \t$dst, $src;"),
[(set vt:$dst, (MoveParam texternalsym:$src))]>;
def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;
def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;
def MoveParamI16 :
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
"cvt.u16.u32 \t$dst, $src;", // ??? Why cvt.u16.u32 ?
[(set i16:$dst, (MoveParam i16:$src))]>;
def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;
def MOV64_PARAM : MoveParamSymbolInst<Int64Regs, i64imm, i64, ".b64">;
def MOV32_PARAM : MoveParamSymbolInst<Int32Regs, i32imm, i32, ".b32">;
class PseudoUseParamInst<NVPTXRegClass regclass, ValueType vt> :
NVPTXInst<(outs), (ins regclass:$src),


@@ -153,6 +153,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include <numeric>
#include <queue>
@@ -373,19 +374,19 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
Type *StructType = Arg->getParamByValType();
const DataLayout &DL = Func->getDataLayout();
uint64_t NewArgAlign =
TLI->getFunctionParamOptimizedAlign(Func, StructType, DL).value();
uint64_t CurArgAlign =
Arg->getAttribute(Attribute::Alignment).getValueAsInt();
const Align NewArgAlign =
TLI->getFunctionParamOptimizedAlign(Func, StructType, DL);
const Align CurArgAlign = Arg->getParamAlign().valueOrOne();
if (CurArgAlign >= NewArgAlign)
return;
LLVM_DEBUG(dbgs() << "Try to use alignment " << NewArgAlign << " instead of "
<< CurArgAlign << " for " << *Arg << '\n');
LLVM_DEBUG(dbgs() << "Try to use alignment " << NewArgAlign.value()
<< " instead of " << CurArgAlign.value() << " for " << *Arg
<< '\n');
auto NewAlignAttr =
Attribute::get(Func->getContext(), Attribute::Alignment, NewArgAlign);
Attribute::getWithAlignment(Func->getContext(), NewArgAlign);
Arg->removeAttr(Attribute::Alignment);
Arg->addAttr(NewAlignAttr);
@@ -402,24 +403,17 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
SmallVector<Load> Loads;
std::queue<LoadContext> Worklist;
Worklist.push({ArgInParamAS, 0});
bool IsGridConstant = isParamGridConstant(*Arg);
while (!Worklist.empty()) {
LoadContext Ctx = Worklist.front();
Worklist.pop();
for (User *CurUser : Ctx.InitialVal->users()) {
if (auto *I = dyn_cast<LoadInst>(CurUser)) {
if (auto *I = dyn_cast<LoadInst>(CurUser))
Loads.push_back({I, Ctx.Offset});
continue;
}
if (auto *I = dyn_cast<BitCastInst>(CurUser)) {
Worklist.push({I, Ctx.Offset});
continue;
}
if (auto *I = dyn_cast<GetElementPtrInst>(CurUser)) {
else if (isa<BitCastInst>(CurUser) || isa<AddrSpaceCastInst>(CurUser))
Worklist.push({cast<Instruction>(CurUser), Ctx.Offset});
else if (auto *I = dyn_cast<GetElementPtrInst>(CurUser)) {
APInt OffsetAccumulated =
APInt::getZero(DL.getIndexSizeInBits(ADDRESS_SPACE_PARAM));
@@ -431,26 +425,13 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
assert(Offset != OffsetLimit && "Expect Offset less than UINT64_MAX");
Worklist.push({I, Ctx.Offset + Offset});
continue;
}
if (isa<MemTransferInst>(CurUser))
continue;
// supported for grid_constant
if (IsGridConstant &&
(isa<CallInst>(CurUser) || isa<StoreInst>(CurUser) ||
isa<PtrToIntInst>(CurUser)))
continue;
llvm_unreachable("All users must be one of: load, "
"bitcast, getelementptr, call, store, ptrtoint");
}
}
for (Load &CurLoad : Loads) {
Align NewLoadAlign(std::gcd(NewArgAlign, CurLoad.Offset));
Align CurLoadAlign(CurLoad.Inst->getAlign());
Align NewLoadAlign(std::gcd(NewArgAlign.value(), CurLoad.Offset));
Align CurLoadAlign = CurLoad.Inst->getAlign();
CurLoad.Inst->setAlignment(std::max(NewLoadAlign, CurLoadAlign));
}
}
@@ -641,7 +622,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
copyByValParam(*Func, *Arg);
}
void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
static void markPointerAsAS(Value *Ptr, const unsigned AS) {
if (Ptr->getType()->getPointerAddressSpace() != ADDRESS_SPACE_GENERIC)
return;
@@ -658,8 +639,7 @@ void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
}
Instruction *PtrInGlobal = new AddrSpaceCastInst(
Ptr, PointerType::get(Ptr->getContext(), ADDRESS_SPACE_GLOBAL),
Ptr->getName(), InsertPt);
Ptr, PointerType::get(Ptr->getContext(), AS), Ptr->getName(), InsertPt);
Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
Ptr->getName(), InsertPt);
// Replace with PtrInGeneric all uses of Ptr except PtrInGlobal.
@@ -667,6 +647,10 @@ void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
PtrInGlobal->setOperand(0, Ptr);
}
void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
markPointerAsAS(Ptr, ADDRESS_SPACE_GLOBAL);
}
// =============================================================================
// Main function for this pass.
// =============================================================================
@@ -724,9 +708,15 @@ bool NVPTXLowerArgs::runOnKernelFunction(const NVPTXTargetMachine &TM,
bool NVPTXLowerArgs::runOnDeviceFunction(const NVPTXTargetMachine &TM,
Function &F) {
LLVM_DEBUG(dbgs() << "Lowering function args of " << F.getName() << "\n");
const auto *TLI =
cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering());
for (Argument &Arg : F.args())
if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
handleByValParam(TM, &Arg);
if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) {
markPointerAsAS(&Arg, ADDRESS_SPACE_LOCAL);
adjustByValArgAlignment(&Arg, &Arg, TLI);
}
return true;
}


@@ -100,6 +100,7 @@ void initializeNVPTXLowerUnreachablePass(PassRegistry &);
void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXProxyRegErasurePass(PassRegistry &);
void initializeNVPTXForwardParamsPassPass(PassRegistry &);
void initializeNVVMIntrRangePass(PassRegistry &);
void initializeNVVMReflectPass(PassRegistry &);
void initializeNVPTXAAWrapperPassPass(PassRegistry &);
@@ -127,6 +128,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
initializeNVPTXCtorDtorLoweringLegacyPass(PR);
initializeNVPTXLowerAggrCopiesPass(PR);
initializeNVPTXProxyRegErasurePass(PR);
initializeNVPTXForwardParamsPassPass(PR);
initializeNVPTXDAGToDAGISelLegacyPass(PR);
initializeNVPTXAAWrapperPassPass(PR);
initializeNVPTXExternalAAWrapperPass(PR);
@@ -429,6 +431,7 @@ bool NVPTXPassConfig::addInstSelector() {
}
void NVPTXPassConfig::addPreRegAlloc() {
addPass(createNVPTXForwardParamsPass());
// Remove Proxy Register pseudo instructions used to keep `callseq_end` alive.
addPass(createNVPTXProxyRegErasurePass());
}


@@ -0,0 +1,138 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s | FileCheck %s
target triple = "nvptx64-nvidia-cuda"
define i32 @test_ld_param_const(ptr byval(i32) %a) {
; CHECK-LABEL: test_ld_param_const(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_ld_param_const_param_0+4];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%p2 = getelementptr i32, ptr %a, i32 1
%ld = load i32, ptr %p2
ret i32 %ld
}
define i32 @test_ld_param_non_const(ptr byval([10 x i32]) %a, i32 %b) {
; CHECK-LABEL: test_ld_param_non_const(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %rd1, test_ld_param_non_const_param_0;
; CHECK-NEXT: ld.param.s32 %rd2, [test_ld_param_non_const_param_1];
; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
; CHECK-NEXT: ld.local.u32 %r1, [%rd3];
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%p2 = getelementptr i8, ptr %a, i32 %b
%ld = load i32, ptr %p2
ret i32 %ld
}
declare void @escape(ptr)
declare void @byval_user(ptr byval(i32))
define void @test_ld_param_escaping(ptr byval(i32) %a) {
; CHECK-LABEL: test_ld_param_escaping(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %rd1, test_ld_param_escaping_param_0;
; CHECK-NEXT: cvta.local.u64 %rd2, %rd1;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .b64 param0;
; CHECK-NEXT: st.param.b64 [param0], %rd2;
; CHECK-NEXT: call.uni
; CHECK-NEXT: escape,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: ret;
call void @escape(ptr %a)
ret void
}
define void @test_ld_param_byval(ptr byval(i32) %a) {
; CHECK-LABEL: test_ld_param_byval(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_ld_param_byval_param_0];
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 4 .b8 param0[4];
; CHECK-NEXT: st.param.b32 [param0], %r1;
; CHECK-NEXT: call.uni
; CHECK-NEXT: byval_user,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK-NEXT: } // callseq 1
; CHECK-NEXT: ret;
call void @byval_user(ptr %a)
ret void
}
define i32 @test_modify_param(ptr byval([10 x i32]) %a, i32 %b, i32 %c ) {
; CHECK-LABEL: test_modify_param(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %rd1, test_modify_param_param_0;
; CHECK-NEXT: ld.param.u32 %r1, [test_modify_param_param_1];
; CHECK-NEXT: ld.param.u32 %r2, [test_modify_param_param_2];
; CHECK-NEXT: st.local.u32 [%rd1+2], %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%p2 = getelementptr i8, ptr %a, i32 2
store volatile i32 %b, ptr %p2
ret i32 %c
}
define i32 @test_multi_block(ptr byval([10 x i32]) %a, i1 %p) {
; CHECK-LABEL: test_multi_block(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u8 %rs1, [test_multi_block_param_1];
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
; CHECK-NEXT: not.pred %p2, %p1;
; CHECK-NEXT: @%p2 bra $L__BB5_2;
; CHECK-NEXT: // %bb.1: // %if
; CHECK-NEXT: ld.param.u32 %r4, [test_multi_block_param_0+4];
; CHECK-NEXT: bra.uni $L__BB5_3;
; CHECK-NEXT: $L__BB5_2: // %else
; CHECK-NEXT: ld.param.u32 %r4, [test_multi_block_param_0+8];
; CHECK-NEXT: $L__BB5_3: // %end
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
br i1 %p, label %if, label %else
if:
%p2 = getelementptr i8, ptr %a, i32 4
%v2 = load i32, ptr %p2
br label %end
else:
%p3 = getelementptr i8, ptr %a, i32 8
%v3 = load i32, ptr %p3
br label %end
end:
%v = phi i32 [ %v2, %if ], [ %v3, %else ]
ret i32 %v
}


@@ -27,16 +27,15 @@ define [2 x i128] @foo(i64 %a, i32 %b) {
define [2 x i128] @foo2(ptr byval([2 x i128]) %a) {
; CHECK-LABEL: foo2(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-NEXT: .reg .b64 %rd<7>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.b64 %rd1, foo2_param_0;
; CHECK-NEXT: ld.param.u64 %rd2, [foo2_param_0+8];
; CHECK-NEXT: ld.param.u64 %rd3, [foo2_param_0];
; CHECK-NEXT: ld.param.u64 %rd4, [foo2_param_0+24];
; CHECK-NEXT: ld.param.u64 %rd5, [foo2_param_0+16];
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd5, %rd4};
; CHECK-NEXT: ld.param.u64 %rd3, [foo2_param_0+8];
; CHECK-NEXT: ld.param.u64 %rd4, [foo2_param_0];
; CHECK-NEXT: ld.param.u64 %rd5, [foo2_param_0+24];
; CHECK-NEXT: ld.param.u64 %rd6, [foo2_param_0+16];
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3};
; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd6, %rd5};
; CHECK-NEXT: ret;
%ptr0 = getelementptr [2 x i128], ptr %a, i64 0, i32 0
%1 = load i128, i128* %ptr0


@@ -12,9 +12,8 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
; OPT-LABEL: define dso_local noundef i32 @non_kernel_function(
; OPT-SAME: ptr noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 captures(none) [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; OPT-NEXT: [[ENTRY:.*:]]
; OPT-NEXT: [[A1:%.*]] = alloca [[STRUCT_UINT4]], align 16
; OPT-NEXT: [[A2:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(101)
; OPT-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 16 [[A1]], ptr addrspace(101) align 16 [[A2]], i64 16, i1 false)
; OPT-NEXT: [[A2:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(5)
; OPT-NEXT: [[A1:%.*]] = addrspacecast ptr addrspace(5) [[A2]] to ptr
; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A1]], ptr addrspacecast (ptr addrspace(1) @gi to ptr)
; OPT-NEXT: [[IDX_EXT:%.*]] = sext i32 [[C]] to i64
; OPT-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]]
@@ -23,38 +22,29 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly
;
; PTX-LABEL: non_kernel_function(
; PTX: {
; PTX-NEXT: .local .align 16 .b8 __local_depot0[16];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b16 %rs<3>;
; PTX-NEXT: .reg .b32 %r<11>;
; PTX-NEXT: .reg .b64 %rd<10>;
; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
; PTX-NEXT: mov.u64 %SPL, __local_depot0;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: mov.b64 %rd1, non_kernel_function_param_0;
; PTX-NEXT: cvta.local.u64 %rd2, %rd1;
; PTX-NEXT: ld.param.u8 %rs1, [non_kernel_function_param_1];
; PTX-NEXT: and.b16 %rs2, %rs1, 1;
; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1;
; PTX-NEXT: add.u64 %rd1, %SP, 0;
; PTX-NEXT: add.u64 %rd2, %SPL, 0;
; PTX-NEXT: ld.param.s32 %rd3, [non_kernel_function_param_2];
; PTX-NEXT: ld.param.u64 %rd4, [non_kernel_function_param_0+8];
; PTX-NEXT: st.local.u64 [%rd2+8], %rd4;
; PTX-NEXT: ld.param.u64 %rd5, [non_kernel_function_param_0];
; PTX-NEXT: st.local.u64 [%rd2], %rd5;
; PTX-NEXT: mov.u64 %rd6, gi;
; PTX-NEXT: cvta.global.u64 %rd7, %rd6;
; PTX-NEXT: selp.b64 %rd8, %rd1, %rd7, %p1;
; PTX-NEXT: add.s64 %rd9, %rd8, %rd3;
; PTX-NEXT: ld.u8 %r1, [%rd9];
; PTX-NEXT: ld.u8 %r2, [%rd9+1];
; PTX-NEXT: mov.u64 %rd3, gi;
; PTX-NEXT: cvta.global.u64 %rd4, %rd3;
; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1;
; PTX-NEXT: ld.param.s32 %rd6, [non_kernel_function_param_2];
; PTX-NEXT: add.s64 %rd7, %rd5, %rd6;
; PTX-NEXT: ld.u8 %r1, [%rd7];
; PTX-NEXT: ld.u8 %r2, [%rd7+1];
; PTX-NEXT: shl.b32 %r3, %r2, 8;
; PTX-NEXT: or.b32 %r4, %r3, %r1;
; PTX-NEXT: ld.u8 %r5, [%rd9+2];
; PTX-NEXT: ld.u8 %r5, [%rd7+2];
; PTX-NEXT: shl.b32 %r6, %r5, 16;
; PTX-NEXT: ld.u8 %r7, [%rd9+3];
; PTX-NEXT: ld.u8 %r7, [%rd7+3];
; PTX-NEXT: shl.b32 %r8, %r7, 24;
; PTX-NEXT: or.b32 %r9, %r8, %r6;
; PTX-NEXT: or.b32 %r10, %r9, %r4;
@@ -91,6 +81,7 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
; OPT-NEXT: store i32 [[ADD]], ptr [[OUT3]], align 4
; OPT-NEXT: ret void
;
%tmp = load i32, ptr %input1, align 4
%add = add i32 %tmp, %input2
store i32 %add, ptr %out
@@ -125,6 +116,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
; OPT-NEXT: store i32 [[ADD]], ptr [[OUT5]], align 4
; OPT-NEXT: ret void
;
%gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
%int1 = load i32, ptr %gep1
@@ -165,6 +157,7 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
; OPT-NEXT: ret void
;
%call = call i32 @escape(ptr %input)
ret void
}
@@ -222,6 +215,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
; OPT-NEXT: ret void
;
%a.addr = alloca i32, align 4
store i32 %a, ptr %a.addr, align 4
%call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
@@ -249,6 +243,7 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i
; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR5]], align 8
; OPT-NEXT: ret void
;
store ptr %input, ptr %addr, align 8
ret void
}
@@ -282,6 +277,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT5]], align 8
; OPT-NEXT: ret void
;
%tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
%1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
@@ -330,6 +326,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou
; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT5]], align 4
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
; OPT-NEXT: ret void
;
%val = load i32, ptr %input
%twice = add i32 %val, %val
store i32 %twice, ptr %output
@@ -383,6 +380,7 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input,
; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
; OPT-NEXT: ret i32 [[ADD]]
;
%ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%val1 = load i32, ptr %ptr1
%ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
@@ -435,6 +433,7 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
; OPT-NEXT: ret void
;
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
@@ -500,6 +499,7 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1,
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
; OPT-NEXT: ret void
;
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
br i1 %less, label %first, label %second
@@ -553,6 +553,7 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
; OPT-NEXT: ret void
;
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
%ptrnew = select i1 %less, ptr %input1, ptr %input2
@@ -584,6 +585,7 @@ define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
; OPT-NEXT: ret i32 [[KEEPALIVE]]
;
%val = load i32, ptr %input
%ptrval = ptrtoint ptr %input to i32
%keepalive = add i32 %val, %ptrval


@@ -1,7 +1,8 @@
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,IR,IRC
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes COMMON,IR,IRO
; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,PTX,PTXC
; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-nvcl| FileCheck %s --check-prefixes COMMON,PTX,PTXO
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes IR,IRC
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes IR,IRO
; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX,PTXC
; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-nvcl| FileCheck %s --check-prefixes PTX,PTXO
; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
@@ -12,12 +13,51 @@ target triple = "nvptx64-nvidia-cuda"
%class.padded = type { i8, i32 }
; Check that nvptx-lower-args preserves arg alignment
; COMMON-LABEL: load_alignment
define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
; IR-LABEL: define void @load_alignment(
; IR-SAME: ptr readonly byval([[CLASS_OUTER:%.*]]) align 8 captures(none) [[ARG:%.*]]) {
; IR-NEXT: [[ENTRY:.*:]]
; IR-NEXT: [[ARG2:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(5)
; IR-NEXT: [[ARG1:%.*]] = addrspacecast ptr addrspace(5) [[ARG2]] to ptr
; IR-NEXT: [[ARG_IDX_VAL:%.*]] = load ptr, ptr [[ARG1]], align 8
; IR-NEXT: [[ARG_IDX1:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG1]], i64 0, i32 0, i32 1
; IR-NEXT: [[ARG_IDX1_VAL:%.*]] = load ptr, ptr [[ARG_IDX1]], align 8
; IR-NEXT: [[ARG_IDX2:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG1]], i64 0, i32 1
; IR-NEXT: [[ARG_IDX2_VAL:%.*]] = load i32, ptr [[ARG_IDX2]], align 8
; IR-NEXT: [[ARG_IDX_VAL_VAL:%.*]] = load i32, ptr [[ARG_IDX_VAL]], align 4
; IR-NEXT: [[ADD_I:%.*]] = add nsw i32 [[ARG_IDX_VAL_VAL]], [[ARG_IDX2_VAL]]
; IR-NEXT: store i32 [[ADD_I]], ptr [[ARG_IDX1_VAL]], align 4
; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull [[ARG_IDX2]])
; IR-NEXT: ret void
;
; PTX-LABEL: load_alignment(
; PTX: {
; PTX-NEXT: .reg .b32 %r<4>;
; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
; PTX-NEXT: mov.b64 %rd1, load_alignment_param_0;
; PTX-NEXT: ld.local.u64 %rd2, [%rd1];
; PTX-NEXT: ld.local.u64 %rd3, [%rd1+8];
; PTX-NEXT: add.s64 %rd4, %rd1, 16;
; PTX-NEXT: cvta.local.u64 %rd5, %rd4;
; PTX-NEXT: ld.local.u32 %r1, [%rd1+16];
; PTX-NEXT: ld.u32 %r2, [%rd2];
; PTX-NEXT: add.s32 %r3, %r2, %r1;
; PTX-NEXT: st.u32 [%rd3], %r3;
; PTX-NEXT: { // callseq 0, 0
; PTX-NEXT: .param .b64 param0;
; PTX-NEXT: st.param.b64 [param0], %rd5;
; PTX-NEXT: .param .b64 retval0;
; PTX-NEXT: call.uni (retval0),
; PTX-NEXT: escape,
; PTX-NEXT: (
; PTX-NEXT: param0
; PTX-NEXT: );
; PTX-NEXT: ld.param.b64 %rd6, [retval0];
; PTX-NEXT: } // callseq 0
; PTX-NEXT: ret;
entry:
; IR: call void @llvm.memcpy.p0.p101.i64(ptr align 8
; PTX: ld.param.u64
; PTX-NOT: ld.param.u8
%arg.idx.val = load ptr, ptr %arg, align 8
%arg.idx1 = getelementptr %class.outer, ptr %arg, i64 0, i32 0, i32 1
%arg.idx1.val = load ptr, ptr %arg.idx1, align 8
@@ -34,86 +74,146 @@ entry:
}
; Check that nvptx-lower-args copies padding as the struct may have been a union
; COMMON-LABEL: load_padding
define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) {
; IR-LABEL: define void @load_padding(
; IR-SAME: ptr readonly byval([[CLASS_PADDED:%.*]]) align 4 captures(none) [[ARG:%.*]]) {
; IR-NEXT: [[ARG2:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(5)
; IR-NEXT: [[ARG1:%.*]] = addrspacecast ptr addrspace(5) [[ARG2]] to ptr
; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull align 16 [[ARG1]])
; IR-NEXT: ret void
;
; PTX-LABEL: load_padding(
; PTX: {
; PTX-NEXT: .local .align 8 .b8 __local_depot1[8];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.u64 %SPL, __local_depot1;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: add.u64 %rd1, %SP, 0;
; PTX-NEXT: add.u64 %rd2, %SPL, 0;
; PTX-NEXT: ld.param.u64 %rd3, [load_padding_param_0];
; PTX-NEXT: st.local.u64 [%rd2], %rd3;
; PTX-NEXT: mov.b64 %rd1, load_padding_param_0;
; PTX-NEXT: cvta.local.u64 %rd2, %rd1;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
; PTX-NEXT: st.param.b64 [param0], %rd1;
; PTX-NEXT: st.param.b64 [param0], %rd2;
; PTX-NEXT: .param .b64 retval0;
; PTX-NEXT: call.uni (retval0),
; PTX-NEXT: escape,
; PTX-NEXT: (
; PTX-NEXT: param0
; PTX-NEXT: );
; PTX-NEXT: ld.param.b64 %rd4, [retval0];
; PTX-NEXT: ld.param.b64 %rd3, [retval0];
; PTX-NEXT: } // callseq 1
; PTX-NEXT: ret;
%tmp = call ptr @escape(ptr nonnull align 16 %arg)
ret void
}
; COMMON-LABEL: ptr_generic
define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
; IRC: %in3 = addrspacecast ptr %in to ptr addrspace(1)
; IRC: %in4 = addrspacecast ptr addrspace(1) %in3 to ptr
; IRC: %out1 = addrspacecast ptr %out to ptr addrspace(1)
; IRC: %out2 = addrspacecast ptr addrspace(1) %out1 to ptr
; PTXC: cvta.to.global.u64
; PTXC: cvta.to.global.u64
; PTXC: ld.global.u32
; PTXC: st.global.u32
; OpenCL can't make assumptions about incoming pointer, so we should generate
; generic pointers load/store.
; IRO-NOT: addrspacecast
; PTXO-NOT: cvta.to.global
; PTXO: ld.u32
; PTXO: st.u32
define ptx_kernel void @ptr_generic(ptr %out, ptr %in) {
; IRC-LABEL: define ptx_kernel void @ptr_generic(
; IRC-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) {
; IRC-NEXT: [[IN3:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
; IRC-NEXT: [[IN4:%.*]] = addrspacecast ptr addrspace(1) [[IN3]] to ptr
; IRC-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
; IRC-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
; IRC-NEXT: [[V:%.*]] = load i32, ptr [[IN4]], align 4
; IRC-NEXT: store i32 [[V]], ptr [[OUT2]], align 4
; IRC-NEXT: ret void
;
; IRO-LABEL: define ptx_kernel void @ptr_generic(
; IRO-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) {
; IRO-NEXT: [[V:%.*]] = load i32, ptr [[IN]], align 4
; IRO-NEXT: store i32 [[V]], ptr [[OUT]], align 4
; IRO-NEXT: ret void
;
; PTXC-LABEL: ptr_generic(
; PTXC: {
; PTXC-NEXT: .reg .b32 %r<2>;
; PTXC-NEXT: .reg .b64 %rd<5>;
; PTXC-EMPTY:
; PTXC-NEXT: // %bb.0:
; PTXC-NEXT: ld.param.u64 %rd1, [ptr_generic_param_0];
; PTXC-NEXT: ld.param.u64 %rd2, [ptr_generic_param_1];
; PTXC-NEXT: cvta.to.global.u64 %rd3, %rd2;
; PTXC-NEXT: cvta.to.global.u64 %rd4, %rd1;
; PTXC-NEXT: ld.global.u32 %r1, [%rd3];
; PTXC-NEXT: st.global.u32 [%rd4], %r1;
; PTXC-NEXT: ret;
;
; PTXO-LABEL: ptr_generic(
; PTXO: {
; PTXO-NEXT: .reg .b32 %r<2>;
; PTXO-NEXT: .reg .b64 %rd<3>;
; PTXO-EMPTY:
; PTXO-NEXT: // %bb.0:
; PTXO-NEXT: ld.param.u64 %rd1, [ptr_generic_param_0];
; PTXO-NEXT: ld.param.u64 %rd2, [ptr_generic_param_1];
; PTXO-NEXT: ld.u32 %r1, [%rd2];
; PTXO-NEXT: st.u32 [%rd1], %r1;
; PTXO-NEXT: ret;
%v = load i32, ptr %in, align 4
store i32 %v, ptr %out, align 4
ret void
}
; COMMON-LABEL: ptr_nongeneric
define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; IR-NOT: addrspacecast
; PTX-NOT: cvta.to.global
; PTX: ld.shared.u32
; PTX st.global.u32
; IR-LABEL: define ptx_kernel void @ptr_nongeneric(
; IR-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) {
; IR-NEXT: [[V:%.*]] = load i32, ptr addrspace(3) [[IN]], align 4
; IR-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; IR-NEXT: ret void
;
; PTX-LABEL: ptr_nongeneric(
; PTX: {
; PTX-NEXT: .reg .b32 %r<2>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.u64 %rd1, [ptr_nongeneric_param_0];
; PTX-NEXT: ld.param.u64 %rd2, [ptr_nongeneric_param_1];
; PTX-NEXT: ld.shared.u32 %r1, [%rd2];
; PTX-NEXT: st.global.u32 [%rd1], %r1;
; PTX-NEXT: ret;
%v = load i32, ptr addrspace(3) %in, align 4
store i32 %v, ptr addrspace(1) %out, align 4
ret void
}
; COMMON-LABEL: ptr_as_int
define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
; IR: [[P:%.*]] = inttoptr i64 %i to ptr
; IRC: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
; IRC: addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRO-NOT: addrspacecast
; PTXC-DAG: ld.param.u64 [[I:%rd.*]], [ptr_as_int_param_0];
; PTXC-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_param_1];
; PTXC: cvta.to.global.u64 %[[P:rd.*]], [[I]];
; PTXC: st.global.u32 [%[[P]]], [[V]];
; PTXO-DAG: ld.param.u64 %[[P:rd.*]], [ptr_as_int_param_0];
; PTXO-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_param_1];
; PTXO: st.u32 [%[[P]]], [[V]];
define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
; IRC-LABEL: define ptx_kernel void @ptr_as_int(
; IRC-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {
; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
; IRC-NEXT: [[P2:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRC-NEXT: store i32 [[V]], ptr [[P2]], align 4
; IRC-NEXT: ret void
;
; IRO-LABEL: define ptx_kernel void @ptr_as_int(
; IRO-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) {
; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4
; IRO-NEXT: ret void
;
; PTXC-LABEL: ptr_as_int(
; PTXC: {
; PTXC-NEXT: .reg .b32 %r<2>;
; PTXC-NEXT: .reg .b64 %rd<3>;
; PTXC-EMPTY:
; PTXC-NEXT: // %bb.0:
; PTXC-NEXT: ld.param.u64 %rd1, [ptr_as_int_param_0];
; PTXC-NEXT: ld.param.u32 %r1, [ptr_as_int_param_1];
; PTXC-NEXT: cvta.to.global.u64 %rd2, %rd1;
; PTXC-NEXT: st.global.u32 [%rd2], %r1;
; PTXC-NEXT: ret;
;
; PTXO-LABEL: ptr_as_int(
; PTXO: {
; PTXO-NEXT: .reg .b32 %r<2>;
; PTXO-NEXT: .reg .b64 %rd<2>;
; PTXO-EMPTY:
; PTXO-NEXT: // %bb.0:
; PTXO-NEXT: ld.param.u64 %rd1, [ptr_as_int_param_0];
; PTXO-NEXT: ld.param.u32 %r1, [ptr_as_int_param_1];
; PTXO-NEXT: st.u32 [%rd1], %r1;
; PTXO-NEXT: ret;
%p = inttoptr i64 %i to ptr
store i32 %v, ptr %p, align 4
ret void
@@ -121,29 +221,52 @@ define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %
%struct.S = type { i64 }
; COMMON-LABEL: ptr_as_int_aggr
define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
; IR: [[S:%.*]] = addrspacecast ptr %s to ptr addrspace(101)
; IR: [[I:%.*]] = load i64, ptr addrspace(101) [[S]], align 8
; IR: [[P0:%.*]] = inttoptr i64 [[I]] to ptr
; IRC: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
; IRC: [[P:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRO-NOT: addrspacecast
; PTXC-DAG: ld.param.u64 [[I:%rd.*]], [ptr_as_int_aggr_param_0];
; PTXC-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_aggr_param_1];
; PTXC: cvta.to.global.u64 %[[P:rd.*]], [[I]];
; PTXC: st.global.u32 [%[[P]]], [[V]];
; PTXO-DAG: ld.param.u64 %[[P:rd.*]], [ptr_as_int_aggr_param_0];
; PTXO-DAG: ld.param.u32 [[V:%r.*]], [ptr_as_int_aggr_param_1];
; PTXO: st.u32 [%[[P]]], [[V]];
; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
; IRC-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
; IRC-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8
; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
; IRC-NEXT: [[P2:%.*]] = addrspacecast ptr addrspace(1) [[P1]] to ptr
; IRC-NEXT: store i32 [[V]], ptr [[P2]], align 4
; IRC-NEXT: ret void
;
; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
; IRO-NEXT: [[S1:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
; IRO-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8
; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4
; IRO-NEXT: ret void
;
; PTXC-LABEL: ptr_as_int_aggr(
; PTXC: {
; PTXC-NEXT: .reg .b32 %r<2>;
; PTXC-NEXT: .reg .b64 %rd<3>;
; PTXC-EMPTY:
; PTXC-NEXT: // %bb.0:
; PTXC-NEXT: ld.param.u32 %r1, [ptr_as_int_aggr_param_1];
; PTXC-NEXT: ld.param.u64 %rd1, [ptr_as_int_aggr_param_0];
; PTXC-NEXT: cvta.to.global.u64 %rd2, %rd1;
; PTXC-NEXT: st.global.u32 [%rd2], %r1;
; PTXC-NEXT: ret;
;
; PTXO-LABEL: ptr_as_int_aggr(
; PTXO: {
; PTXO-NEXT: .reg .b32 %r<2>;
; PTXO-NEXT: .reg .b64 %rd<2>;
; PTXO-EMPTY:
; PTXO-NEXT: // %bb.0:
; PTXO-NEXT: ld.param.u32 %r1, [ptr_as_int_aggr_param_1];
; PTXO-NEXT: ld.param.u64 %rd1, [ptr_as_int_aggr_param_0];
; PTXO-NEXT: st.u32 [%rd1], %r1;
; PTXO-NEXT: ret;
%i = load i64, ptr %s, align 8
%p = inttoptr i64 %i to ptr
store i32 %v, ptr %p, align 4
ret void
}
; Function Attrs: convergent nounwind
declare dso_local ptr @escape(ptr) local_unnamed_addr


@@ -338,18 +338,18 @@ define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, .
; CHECK-PTX-LABEL: variadics4(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .reg .b32 %r<2>;
; CHECK-PTX-NEXT: .reg .b64 %rd<9>;
; CHECK-PTX-NEXT: .reg .b64 %rd<10>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics4_param_1];
; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7;
; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8;
; CHECK-PTX-NEXT: ld.u64 %rd4, [%rd3];
; CHECK-PTX-NEXT: ld.param.u64 %rd5, [variadics4_param_0];
; CHECK-PTX-NEXT: ld.param.u64 %rd6, [variadics4_param_0+8];
; CHECK-PTX-NEXT: add.s64 %rd7, %rd5, %rd6;
; CHECK-PTX-NEXT: add.s64 %rd8, %rd7, %rd4;
; CHECK-PTX-NEXT: cvt.u32.u64 %r1, %rd8;
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [variadics4_param_1];
; CHECK-PTX-NEXT: add.s64 %rd3, %rd2, 7;
; CHECK-PTX-NEXT: and.b64 %rd4, %rd3, -8;
; CHECK-PTX-NEXT: ld.u64 %rd5, [%rd4];
; CHECK-PTX-NEXT: ld.param.u64 %rd6, [variadics4_param_0];
; CHECK-PTX-NEXT: ld.param.u64 %rd7, [variadics4_param_0+8];
; CHECK-PTX-NEXT: add.s64 %rd8, %rd6, %rd7;
; CHECK-PTX-NEXT: add.s64 %rd9, %rd8, %rd5;
; CHECK-PTX-NEXT: cvt.u32.u64 %r1, %rd9;
; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-PTX-NEXT: ret;
entry:


@@ -6,28 +6,18 @@
define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: caller_St8x4(
; CHECK: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot0[32];
; CHECK-NEXT: .reg .b32 %SP;
; CHECK-NEXT: .reg .b32 %SPL;
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-NEXT: .reg .b64 %rd<13>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: mov.u32 %SPL, __local_depot0;
; CHECK-NEXT: ld.param.u32 %r1, [caller_St8x4_param_1];
; CHECK-NEXT: add.u32 %r3, %SPL, 0;
; CHECK-NEXT: ld.param.u64 %rd1, [caller_St8x4_param_0+24];
; CHECK-NEXT: st.local.u64 [%r3+24], %rd1;
; CHECK-NEXT: ld.param.u64 %rd2, [caller_St8x4_param_0+16];
; CHECK-NEXT: st.local.u64 [%r3+16], %rd2;
; CHECK-NEXT: ld.param.u64 %rd3, [caller_St8x4_param_0+8];
; CHECK-NEXT: st.local.u64 [%r3+8], %rd3;
; CHECK-NEXT: ld.param.u64 %rd4, [caller_St8x4_param_0];
; CHECK-NEXT: st.local.u64 [%r3], %rd4;
; CHECK-NEXT: ld.param.u64 %rd1, [caller_St8x4_param_0+8];
; CHECK-NEXT: ld.param.u64 %rd2, [caller_St8x4_param_0];
; CHECK-NEXT: ld.param.u64 %rd3, [caller_St8x4_param_0+24];
; CHECK-NEXT: ld.param.u64 %rd4, [caller_St8x4_param_0+16];
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 16 .b8 param0[32];
; CHECK-NEXT: st.param.v2.b64 [param0], {%rd4, %rd3};
; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd2, %rd1};
; CHECK-NEXT: st.param.v2.b64 [param0], {%rd2, %rd1};
; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd4, %rd3};
; CHECK-NEXT: .param .align 16 .b8 retval0[32];
; CHECK-NEXT: call.uni (retval0),
; CHECK-NEXT: callee_St8x4,
@@ -37,10 +27,11 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct
; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0];
; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16];
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: st.u64 [%r1], %rd5;
; CHECK-NEXT: st.u64 [%r1+8], %rd6;
; CHECK-NEXT: st.u64 [%r1+16], %rd7;
; CHECK-NEXT: st.u64 [%r1+24], %rd8;
; CHECK-NEXT: ld.param.u32 %r3, [caller_St8x4_param_1];
; CHECK-NEXT: st.u64 [%r3], %rd5;
; CHECK-NEXT: st.u64 [%r3+8], %rd6;
; CHECK-NEXT: st.u64 [%r3+16], %rd7;
; CHECK-NEXT: st.u64 [%r3+24], %rd8;
; CHECK-NEXT: ret;
%call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2
%.fca.0.extract = extractvalue [4 x i64] %call, 0
@@ -61,6 +52,7 @@ define internal fastcc [4 x i64] @callee_St8x4(ptr nocapture noundef readonly by
; CHECK-LABEL: callee_St8x4(
; CHECK: // @callee_St8x4
; CHECK-NEXT: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: