Files
clang-p2996/polly/lib/CodeGen/PTXGenerator.cpp
Chandler Carruth d01918fa13 [PM] Convert Polly over to directly use the legacy pass manager
namespace and header rather than the top-level header and using
declarations. These helpers impede modular builds and are going away.
Migrating away from them will also be necessary to start mixing in any
usage of the new pass manager.

llvm-svn: 229091
2015-02-13 09:51:50 +00:00

711 lines
26 KiB
C++

//===------ PTXGenerator.cpp - IR helper to create GPU code --------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains functions to create GPU-parallel code as LLVM-IR.
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/PTXGenerator.h"
#ifdef GPU_CODEGEN
#include "polly/ScopDetection.h"
#include "polly/ScopInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;
using namespace polly;
/// Construct a PTX generator.
///
/// Stores the IR builder, the owning pass and the GPU target triple, sets the
/// grid and block dimensions to 1 and the output size to 0 bytes, and then
/// creates the opaque GPU runtime struct types.
///
/// @param Builder The IR builder used for all code emission.
/// @param P       The pass whose analyses are queried later on.
/// @param Triple  The target triple used when generating the PTX assembly.
PTXGenerator::PTXGenerator(PollyIRBuilder &Builder, Pass *P,
const std::string &Triple)
: Builder(Builder), P(P), GPUTriple(Triple), GridWidth(1), GridHeight(1),
BlockWidth(1), BlockHeight(1), OutputBytes(0) {
// InitializeGPUDataTypes() goes through getModule(), which dereferences the
// builder's current insert block, so the builder must already be positioned
// inside a function at this point.
InitializeGPUDataTypes();
}
/// Return the module that contains the function the builder is currently
/// inserting into.
Module *PTXGenerator::getModule() {
  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *EnclosingFn = InsertBB->getParent();
  return EnclosingFn->getParent();
}
/// Create the (still empty) definition of the PTX kernel subfunction.
///
/// The new function returns void, takes \p NumArgs parameters of type i8*,
/// has internal linkage, uses the PTX_Kernel calling convention and is named
/// after the function currently being built, with a "_ptx_subfn" suffix.
///
/// @param NumArgs Number of array parameters; only 1 is supported (asserted).
/// @returns The newly created function.
Function *PTXGenerator::createSubfunctionDefinition(int NumArgs) {
  assert(NumArgs == 1 && "we support only one array access now.");

  Module *HostModule = getModule();
  Function *HostFn = Builder.GetInsertBlock()->getParent();

  // Every kernel parameter is passed as a generic byte pointer.
  std::vector<Type *> ParamTys;
  for (int Idx = 0; Idx < NumArgs; ++Idx)
    ParamTys.push_back(Builder.getInt8PtrTy());

  FunctionType *KernelFnTy =
      FunctionType::get(Builder.getVoidTy(), ParamTys, false);
  Function *KernelFn =
      Function::Create(KernelFnTy, Function::InternalLinkage,
                       HostFn->getName() + "_ptx_subfn", HostModule);
  KernelFn->setCallingConv(CallingConv::PTX_Kernel);

  // Do not run any optimization pass on the new function.
  P->getAnalysis<polly::ScopDetection>().markFunctionAsInvalid(KernelFn);

  Function::arg_iterator ArgEnd = KernelFn->arg_end();
  for (Function::arg_iterator Arg = KernelFn->arg_begin(); Arg != ArgEnd;
       ++Arg)
    Arg->setName("ptx.Array");

  return KernelFn;
}
/// Create the PTX kernel subfunction and position the builder inside it.
///
/// The kernel gets three blocks: "ptx.setup" (reads the PTX grid/block/thread
/// registers), "ptx.loop_body" (computes the values that replace the original
/// induction variables) and "ptx.exit" (returns). On success the builder is
/// left pointing just before the branch that ends "ptx.loop_body", so the
/// caller can emit the loop body there.
///
/// @param UsedValues  Host values used in the kernel. Each one is mapped in
///                    \p VMap to the corresponding kernel argument, bitcast to
///                    the host value's type.
/// @param OriginalIVS The induction variables of the parallel loops (1 to 4
///                    are supported). Each is mapped in \p VMap to an
///                    expression built from block and thread ids.
/// @param VMap        Receives the host-value to kernel-value mappings.
/// @param SubFunction Receives the created kernel, or nullptr if the loop
///                    depth is unsupported.
void PTXGenerator::createSubfunction(SetVector<Value *> &UsedValues,
                                     SetVector<Value *> &OriginalIVS,
                                     PTXGenerator::ValueToValueMapTy &VMap,
                                     Function **SubFunction) {
  Function *FN = createSubfunctionDefinition(UsedValues.size());
  Module *M = getModule();
  LLVMContext &Context = FN->getContext();
  IntegerType *Ty = Builder.getInt64Ty();

  // Store the previous basic block.
  BasicBlock *PrevBB = Builder.GetInsertBlock();

  // Create basic blocks.
  BasicBlock *HeaderBB = BasicBlock::Create(Context, "ptx.setup", FN);
  BasicBlock *ExitBB = BasicBlock::Create(Context, "ptx.exit", FN);
  BasicBlock *BodyBB = BasicBlock::Create(Context, "ptx.loop_body", FN);

  // Register the new blocks in the dominator tree provided by the pass.
  DominatorTree &DT = P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  DT.addNewBlock(HeaderBB, PrevBB);
  DT.addNewBlock(ExitBB, HeaderBB);
  DT.addNewBlock(BodyBB, HeaderBB);

  Builder.SetInsertPoint(HeaderBB);

  // Insert VMap items with maps of array base address on the host to base
  // address on the device.
  Function::arg_iterator AI = FN->arg_begin();
  for (unsigned j = 0; j < UsedValues.size(); j++) {
    Value *BaseAddr = UsedValues[j];
    Type *ArrayTy = BaseAddr->getType();
    Value *Param = Builder.CreateBitCast(AI, ArrayTy);
    VMap.insert(std::make_pair(BaseAddr, Param));
    ++AI;
  }

  // FIXME: These intrinsics should be inserted on-demand. However, we insert
  // them all currently for simplicity.
  Function *GetNctaidX =
      Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_x);
  Function *GetNctaidY =
      Intrinsic::getDeclaration(M, Intrinsic::ptx_read_nctaid_y);
  Function *GetCtaidX =
      Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_x);
  Function *GetCtaidY =
      Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ctaid_y);
  Function *GetNtidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_x);
  Function *GetNtidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_ntid_y);
  Function *GetTidX = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_x);
  Function *GetTidY = Intrinsic::getDeclaration(M, Intrinsic::ptx_read_tid_y);

  // Read every register and zero-extend/truncate it to i64. Note that these
  // locals intentionally shadow the members of the same name.
  Value *GridWidth = Builder.CreateCall(GetNctaidX);
  GridWidth = Builder.CreateIntCast(GridWidth, Ty, false);
  Value *GridHeight = Builder.CreateCall(GetNctaidY);
  GridHeight = Builder.CreateIntCast(GridHeight, Ty, false);
  Value *BlockWidth = Builder.CreateCall(GetNtidX);
  BlockWidth = Builder.CreateIntCast(BlockWidth, Ty, false);
  Value *BlockHeight = Builder.CreateCall(GetNtidY);
  BlockHeight = Builder.CreateIntCast(BlockHeight, Ty, false);
  Value *BIDx = Builder.CreateCall(GetCtaidX);
  BIDx = Builder.CreateIntCast(BIDx, Ty, false);
  Value *BIDy = Builder.CreateCall(GetCtaidY);
  BIDy = Builder.CreateIntCast(BIDy, Ty, false);
  Value *TIDx = Builder.CreateCall(GetTidX);
  TIDx = Builder.CreateIntCast(TIDx, Ty, false);
  Value *TIDy = Builder.CreateCall(GetTidY);
  TIDy = Builder.CreateIntCast(TIDy, Ty, false);

  Builder.CreateBr(BodyBB);
  Builder.SetInsertPoint(BodyBB);

  // Build one substitution per original induction variable. The fewer loop
  // dimensions there are, the more block/thread ids are linearized into a
  // single index.
  unsigned NumDims = OriginalIVS.size();
  std::vector<Value *> Substitutions;
  Value *BlockID, *ThreadID;
  switch (NumDims) {
  case 1: {
    Value *BlockSize =
        Builder.CreateMul(BlockWidth, BlockHeight, "p_gpu_blocksize");
    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
    BlockID = Builder.CreateAdd(BlockID, BIDx);
    BlockID = Builder.CreateMul(BlockID, BlockSize);
    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
    ThreadID = Builder.CreateAdd(ThreadID, BlockID);
    Substitutions.push_back(ThreadID);
    break;
  }
  case 2: {
    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
    BlockID = Builder.CreateAdd(BlockID, BIDx);
    Substitutions.push_back(BlockID);
    ThreadID = Builder.CreateMul(TIDy, BlockWidth, "p_gpu_index_j");
    ThreadID = Builder.CreateAdd(ThreadID, TIDx);
    Substitutions.push_back(ThreadID);
    break;
  }
  case 3: {
    BlockID = Builder.CreateMul(BIDy, GridWidth, "p_gpu_index_i");
    BlockID = Builder.CreateAdd(BlockID, BIDx);
    Substitutions.push_back(BlockID);
    Substitutions.push_back(TIDy);
    Substitutions.push_back(TIDx);
    break;
  }
  case 4: {
    Substitutions.push_back(BIDy);
    Substitutions.push_back(BIDx);
    Substitutions.push_back(TIDy);
    Substitutions.push_back(TIDx);
    break;
  }
  default:
    // The condition must be `false`: `assert(true && ...)` can never fire, so
    // an unsupported depth (0 or more than 4) went unnoticed in debug builds.
    assert(false &&
           "We cannot transform parallel loops whose depth is larger than 4.");
    // In builds without assertions, at least hand back a defined value.
    *SubFunction = nullptr;
    return;
  }

  assert(OriginalIVS.size() == Substitutions.size() &&
         "The size of IVS should be equal to the size of substitutions.");
  for (unsigned i = 0; i < OriginalIVS.size(); ++i) {
    VMap.insert(std::make_pair(OriginalIVS[i], Substitutions[i]));
  }

  Builder.CreateBr(ExitBB);
  // Step back onto the branch just created, so that code emitted by the
  // caller lands before the terminator of the loop body block.
  Builder.SetInsertPoint(--Builder.GetInsertPoint());
  BasicBlock::iterator LoopBody = Builder.GetInsertPoint();

  // Add the termination of the ptx-device subfunction.
  Builder.SetInsertPoint(ExitBB);
  Builder.CreateRetVoid();

  Builder.SetInsertPoint(LoopBody);
  *SubFunction = FN;
}
/// Create the PTX kernel subfunction and report where its body must be
/// emitted, without moving the builder.
///
/// @param UsedValues  Forwarded to createSubfunction().
/// @param OriginalIVS Forwarded to createSubfunction().
/// @param VMap        Forwarded to createSubfunction(), which fills it.
/// @param LoopBody    Receives the insert point createSubfunction() left the
///                    builder at.
void PTXGenerator::startGeneration(SetVector<Value *> &UsedValues,
                                   SetVector<Value *> &OriginalIVS,
                                   ValueToValueMapTy &VMap,
                                   BasicBlock::iterator *LoopBody) {
  // Remember the builder position so it can be restored afterwards.
  BasicBlock::iterator SavedIP = Builder.GetInsertPoint();

  // The created kernel itself is not needed here; only the builder position
  // that createSubfunction() leaves behind is.
  Function *Kernel;
  createSubfunction(UsedValues, OriginalIVS, VMap, &Kernel);

  *LoopBody = Builder.GetInsertPoint();
  Builder.SetInsertPoint(SavedIP);
}
/// Return the 64-bit integer type as provided by the IR builder.
IntegerType *PTXGenerator::getInt64Type() {
  IntegerType *I64Ty = Builder.getInt64Ty();
  return I64Ty;
}
/// Return the type i8* (pointer to the builder's 8-bit integer type, default
/// address space).
PointerType *PTXGenerator::getI8PtrType() {
  Type *ByteTy = Builder.getInt8Ty();
  return PointerType::getUnqual(ByteTy);
}
/// Return the type i8** (pointer to the type returned by getI8PtrType()).
PointerType *PTXGenerator::getPtrI8PtrType() {
  PointerType *BytePtrTy = getI8PtrType();
  return PointerType::getUnqual(BytePtrTy);
}
/// Return the type float*, created in the context of the current module.
PointerType *PTXGenerator::getFloatPtrType() {
  LLVMContext &Ctx = getModule()->getContext();
  return llvm::Type::getFloatPtrTy(Ctx);
}
/// Return a pointer type to the "struct.PollyGPUContextT" runtime struct.
PointerType *PTXGenerator::getGPUContextPtrType() {
  Type *Pointee = ContextTy;
  return PointerType::getUnqual(Pointee);
}
/// Return a pointer type to the "struct.PollyGPUModuleT" runtime struct.
PointerType *PTXGenerator::getGPUModulePtrType() {
  Type *Pointee = ModuleTy;
  return PointerType::getUnqual(Pointee);
}
/// Return a pointer type to the "struct.PollyGPUDeviceT" runtime struct.
PointerType *PTXGenerator::getGPUDevicePtrType() {
  Type *Pointee = DeviceTy;
  return PointerType::getUnqual(Pointee);
}
/// Return a pointer type to the "struct.PollyGPUDevicePtrT" runtime struct.
PointerType *PTXGenerator::getPtrGPUDevicePtrType() {
  Type *Pointee = DevDataTy;
  return PointerType::getUnqual(Pointee);
}
/// Return a pointer type to the "struct.PollyGPUFunctionT" runtime struct.
PointerType *PTXGenerator::getGPUFunctionPtrType() {
  Type *Pointee = KernelTy;
  return PointerType::getUnqual(Pointee);
}
/// Return a pointer type to the "struct.PollyGPUEventT" runtime struct.
PointerType *PTXGenerator::getGPUEventPtrType() {
  Type *Pointee = EventTy;
  return PointerType::getUnqual(Pointee);
}
void PTXGenerator::InitializeGPUDataTypes() {
LLVMContext &Context = getModule()->getContext();
ContextTy = StructType::create(Context, "struct.PollyGPUContextT");
ModuleTy = StructType::create(Context, "struct.PollyGPUModuleT");
KernelTy = StructType::create(Context, "struct.PollyGPUFunctionT");
DeviceTy = StructType::create(Context, "struct.PollyGPUDeviceT");
DevDataTy = StructType::create(Context, "struct.PollyGPUDevicePtrT");
EventTy = StructType::create(Context, "struct.PollyGPUEventT");
}
/// Emit a call to the runtime function `polly_initDevice`, declaring it in the
/// current module first if it does not exist yet.
///
/// @param Context First call argument; the declaration expects a pointer to a
///                GPU context pointer.
/// @param Device  Second call argument; the declaration expects a pointer to
///                a GPU device pointer.
void PTXGenerator::createCallInitDevice(Value *Context, Value *Device) {
  const char *const FnName = "polly_initDevice";
  Module *HostModule = getModule();
  Function *Callee = HostModule->getFunction(FnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    Type *ParamTys[] = {PointerType::getUnqual(getGPUContextPtrType()),
                        PointerType::getUnqual(getGPUDevicePtrType())};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall2(Callee, Context, Device);
}
/// Emit a call to the runtime function `polly_getPTXModule`, declaring it in
/// the current module first if it does not exist yet.
///
/// @param Buffer First call argument; the declaration expects an i8*.
/// @param Module Second call argument; the declaration expects a pointer to a
///               GPU module pointer.
void PTXGenerator::createCallGetPTXModule(Value *Buffer, Value *Module) {
  const char *const FnName = "polly_getPTXModule";
  llvm::Module *HostModule = getModule();
  Function *Callee = HostModule->getFunction(FnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    Type *ParamTys[] = {getI8PtrType(),
                        PointerType::getUnqual(getGPUModulePtrType())};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall2(Callee, Buffer, Module);
}
/// Emit a call to the runtime function `polly_getPTXKernelEntry`, declaring it
/// in the current module first if it does not exist yet.
///
/// @param Entry  First call argument; the declaration expects an i8*.
/// @param Module Second call argument; the declaration expects a GPU module
///               pointer.
/// @param Kernel Third call argument; the declaration expects a pointer to a
///               GPU function pointer.
void PTXGenerator::createCallGetPTXKernelEntry(Value *Entry, Value *Module,
                                               Value *Kernel) {
  const char *const FnName = "polly_getPTXKernelEntry";
  llvm::Module *HostModule = getModule();
  Function *Callee = HostModule->getFunction(FnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    Type *ParamTys[] = {getI8PtrType(), getGPUModulePtrType(),
                        PointerType::getUnqual(getGPUFunctionPtrType())};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall3(Callee, Entry, Module, Kernel);
}
/// Emit a call to the runtime function
/// `polly_allocateMemoryForHostAndDevice`, declaring it in the current module
/// first if it does not exist yet.
///
/// @param HostData   First call argument; the declaration expects an i8**.
/// @param DeviceData Second call argument; the declaration expects a pointer
///                   to the type returned by getPtrGPUDevicePtrType().
/// @param Size       Third call argument; the declaration expects an i64.
void PTXGenerator::createCallAllocateMemoryForHostAndDevice(Value *HostData,
                                                            Value *DeviceData,
                                                            Value *Size) {
  const char *const FnName = "polly_allocateMemoryForHostAndDevice";
  Module *HostModule = getModule();
  Function *Callee = HostModule->getFunction(FnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    Type *ParamTys[] = {getPtrI8PtrType(),
                        PointerType::getUnqual(getPtrGPUDevicePtrType()),
                        getInt64Type()};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall3(Callee, HostData, DeviceData, Size);
}
/// Emit a call to the runtime function `polly_copyFromHostToDevice`, declaring
/// it in the current module first if it does not exist yet.
///
/// @param DeviceData First call argument; the declaration expects the type
///                   returned by getPtrGPUDevicePtrType().
/// @param HostData   Second call argument; the declaration expects an i8*.
/// @param Size       Third call argument; the declaration expects an i64.
void PTXGenerator::createCallCopyFromHostToDevice(Value *DeviceData,
                                                  Value *HostData,
                                                  Value *Size) {
  const char *const FnName = "polly_copyFromHostToDevice";
  Module *HostModule = getModule();
  Function *Callee = HostModule->getFunction(FnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    Type *ParamTys[] = {getPtrGPUDevicePtrType(), getI8PtrType(),
                        getInt64Type()};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall3(Callee, DeviceData, HostData, Size);
}
/// Emit a call to the runtime function `polly_copyFromDeviceToHost`, declaring
/// it in the current module first if it does not exist yet.
///
/// @param HostData   First call argument; the declaration expects an i8*.
/// @param DeviceData Second call argument; the declaration expects the type
///                   returned by getPtrGPUDevicePtrType().
/// @param Size       Third call argument; the declaration expects an i64.
void PTXGenerator::createCallCopyFromDeviceToHost(Value *HostData,
                                                  Value *DeviceData,
                                                  Value *Size) {
  const char *const FnName = "polly_copyFromDeviceToHost";
  Module *HostModule = getModule();
  Function *Callee = HostModule->getFunction(FnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    Type *ParamTys[] = {getI8PtrType(), getPtrGPUDevicePtrType(),
                        getInt64Type()};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall3(Callee, HostData, DeviceData, Size);
}
/// Emit a call to the runtime function `polly_setKernelParameters`, declaring
/// it in the current module first if it does not exist yet.
///
/// @param Kernel      First call argument; the declaration expects a GPU
///                    function pointer.
/// @param BlockWidth  Second call argument; the declaration expects an i64.
/// @param BlockHeight Third call argument; the declaration expects an i64.
/// @param DeviceData  Fourth call argument; the declaration expects the type
///                    returned by getPtrGPUDevicePtrType().
void PTXGenerator::createCallSetKernelParameters(Value *Kernel,
                                                 Value *BlockWidth,
                                                 Value *BlockHeight,
                                                 Value *DeviceData) {
  const char *const FnName = "polly_setKernelParameters";
  Module *HostModule = getModule();
  Function *Callee = HostModule->getFunction(FnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    Type *ParamTys[] = {getGPUFunctionPtrType(), getInt64Type(),
                        getInt64Type(), getPtrGPUDevicePtrType()};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall4(Callee, Kernel, BlockWidth, BlockHeight, DeviceData);
}
/// Emit a call to the runtime function `polly_launchKernel`, declaring it in
/// the current module first if it does not exist yet.
///
/// @param Kernel     First call argument; the declaration expects a GPU
///                   function pointer.
/// @param GridWidth  Second call argument; the declaration expects an i64.
/// @param GridHeight Third call argument; the declaration expects an i64.
void PTXGenerator::createCallLaunchKernel(Value *Kernel, Value *GridWidth,
                                          Value *GridHeight) {
  const char *const FnName = "polly_launchKernel";
  Module *HostModule = getModule();
  Function *Callee = HostModule->getFunction(FnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    Type *ParamTys[] = {getGPUFunctionPtrType(), getInt64Type(),
                        getInt64Type()};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall3(Callee, Kernel, GridWidth, GridHeight);
}
/// Emit a call to the runtime function `polly_startTimerByCudaEvent`,
/// declaring it in the current module first if it does not exist yet.
///
/// @param StartEvent First call argument; the declaration expects a pointer
///                   to a GPU event pointer.
/// @param StopEvent  Second call argument; the declaration expects a pointer
///                   to a GPU event pointer.
void PTXGenerator::createCallStartTimerByCudaEvent(Value *StartEvent,
                                                   Value *StopEvent) {
  const char *const FnName = "polly_startTimerByCudaEvent";
  Module *HostModule = getModule();
  Function *Callee = HostModule->getFunction(FnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    Type *ParamTys[] = {PointerType::getUnqual(getGPUEventPtrType()),
                        PointerType::getUnqual(getGPUEventPtrType())};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall2(Callee, StartEvent, StopEvent);
}
/// Emit a call to the runtime function `polly_stopTimerByCudaEvent`,
/// declaring it in the current module first if it does not exist yet.
///
/// @param StartEvent First call argument; the declaration expects a GPU event
///                   pointer.
/// @param StopEvent  Second call argument; the declaration expects a GPU
///                   event pointer.
/// @param Timer      Third call argument; the declaration expects a float*.
void PTXGenerator::createCallStopTimerByCudaEvent(Value *StartEvent,
                                                  Value *StopEvent,
                                                  Value *Timer) {
  const char *const FnName = "polly_stopTimerByCudaEvent";
  Module *HostModule = getModule();
  Function *Callee = HostModule->getFunction(FnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    Type *ParamTys[] = {getGPUEventPtrType(), getGPUEventPtrType(),
                        getFloatPtrType()};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall3(Callee, StartEvent, StopEvent, Timer);
}
/// Emit a call to the runtime function `polly_cleanupGPGPUResources`,
/// declaring it in the current module first if it does not exist yet.
///
/// @param HostData   First call argument; the declaration expects an i8*.
/// @param DeviceData Second call argument; the declaration expects the type
///                   returned by getPtrGPUDevicePtrType().
/// @param Module     Third call argument; the declaration expects a GPU
///                   module pointer.
/// @param Context    Fourth call argument; the declaration expects a GPU
///                   context pointer.
/// @param Kernel     Fifth call argument; the declaration expects a GPU
///                   function pointer.
void PTXGenerator::createCallCleanupGPGPUResources(Value *HostData,
                                                   Value *DeviceData,
                                                   Value *Module,
                                                   Value *Context,
                                                   Value *Kernel) {
  const char *const FnName = "polly_cleanupGPGPUResources";
  llvm::Module *HostModule = getModule();
  Function *Callee = HostModule->getFunction(FnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    Type *ParamTys[] = {getI8PtrType(), getPtrGPUDevicePtrType(),
                        getGPUModulePtrType(), getGPUContextPtrType(),
                        getGPUFunctionPtrType()};
    FunctionType *FnTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, false);
    Callee =
        Function::Create(FnTy, Function::ExternalLinkage, FnName, HostModule);
  }

  Builder.CreateCall5(Callee, HostData, DeviceData, Module, Context, Kernel);
}
/// Return the GridWidth member as an i64 constant.
Value *PTXGenerator::getCUDAGridWidth() {
  IntegerType *I64Ty = getInt64Type();
  return ConstantInt::get(I64Ty, GridWidth);
}
/// Return the GridHeight member as an i64 constant.
Value *PTXGenerator::getCUDAGridHeight() {
  IntegerType *I64Ty = getInt64Type();
  return ConstantInt::get(I64Ty, GridHeight);
}
/// Return the BlockWidth member as an i64 constant.
Value *PTXGenerator::getCUDABlockWidth() {
  IntegerType *I64Ty = getInt64Type();
  return ConstantInt::get(I64Ty, BlockWidth);
}
/// Return the BlockHeight member as an i64 constant.
Value *PTXGenerator::getCUDABlockHeight() {
  IntegerType *I64Ty = getInt64Type();
  return ConstantInt::get(I64Ty, BlockHeight);
}
/// Return the OutputBytes member as an i64 constant.
Value *PTXGenerator::getOutputArraySizeInBytes() {
  IntegerType *I64Ty = getInt64Type();
  return ConstantInt::get(I64Ty, OutputBytes);
}
/// Clone all PTX functions of \p M into a freshly allocated module.
///
/// A function is cloned when it has a body and uses the PTX_Device or the
/// PTX_Kernel calling convention. The new module is named "TempGPUModule",
/// lives in the same context as \p M and gets the normalized form of
/// \p Triple as its target triple.
///
/// @param M      The module to extract the functions from.
/// @param Triple The target triple for the new module.
/// @returns The new module, allocated with `new`; the caller owns it.
static Module *extractPTXFunctionsFromModule(const Module *M,
                                             const StringRef &Triple) {
  llvm::ValueToValueMapTy ValueMap;
  Module *GPUModule = new Module("TempGPUModule", M->getContext());
  GPUModule->setTargetTriple(Triple::normalize(Triple));

  for (Module::const_iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI) {
    // Skip plain declarations; there is nothing to clone.
    if (FI->isDeclaration())
      continue;

    // Skip everything that is not GPU code.
    if (FI->getCallingConv() != CallingConv::PTX_Device &&
        FI->getCallingConv() != CallingConv::PTX_Kernel)
      continue;

    // Create an empty twin of the function in the new module.
    FunctionType *FnTy = cast<FunctionType>(FI->getType()->getElementType());
    Function *Clone =
        Function::Create(FnTy, FI->getLinkage(), FI->getName(), GPUModule);
    Clone->copyAttributesFrom(FI);
    ValueMap[FI] = Clone;

    // Carry over the argument names and map old arguments to new ones.
    Function::arg_iterator DstArg = Clone->arg_begin();
    for (Function::const_arg_iterator SrcArg = FI->arg_begin();
         SrcArg != FI->arg_end(); ++SrcArg) {
      DstArg->setName(SrcArg->getName());
      ValueMap[SrcArg] = DstArg++;
    }

    SmallVector<ReturnInst *, 8> ClonedReturns; // Ignore returns cloned.
    CloneFunctionInto(Clone, FI, ValueMap, /*ModuleLevelChanges=*/true,
                      ClonedReturns);
  }

  return GPUModule;
}
/// Run the code generator of the given target on \p New and store the
/// resulting assembly text in \p ASM.
///
/// @param New      The module to compile.
/// @param Triple   The target triple; it is normalized before use.
/// @param MCPU     The CPU name passed to the target machine.
/// @param Features The feature string passed to the target machine.
/// @param ASM      Receives the generated assembly.
/// @returns true on success; false (after printing a message to errs()) if
///          the target cannot be found, no target machine can be created, or
///          the target cannot emit assembly files.
static bool createASMAsString(Module *New, const StringRef &Triple,
                              const StringRef &MCPU, const StringRef &Features,
                              std::string &ASM) {
  llvm::Triple TheTriple(Triple::normalize(Triple));
  std::string ErrMsg;
  const Target *TheTarget =
      TargetRegistry::lookupTarget(TheTriple.getTriple(), ErrMsg);
  if (!TheTarget) {
    errs() << ErrMsg << "\n";
    return false;
  }

  TargetOptions Options;
  std::unique_ptr<TargetMachine> target(TheTarget->createTargetMachine(
      TheTriple.getTriple(), MCPU, Features, Options));
  // This used to be guarded by an assert only, so a build without assertions
  // would dereference a null pointer below. Fail like the other error paths.
  if (!target) {
    errs() << "Could not allocate target machine!\n";
    return false;
  }
  TargetMachine &Target = *target;

  // Build up all of the passes that we want to do to the module. The pass
  // manager is named through the `legacy` namespace, because this file only
  // includes llvm/IR/LegacyPassManager.h and does not rely on top-level using
  // declarations.
  legacy::PassManager PM;
  PM.add(new TargetLibraryInfoWrapperPass(TheTriple));
  PM.add(new DataLayoutPass(*Target.getDataLayout()));
  Target.addAnalysisPasses(PM);

  {
    // The streams are scoped so that everything has been written to ASM by
    // the time this function returns.
    raw_string_ostream NameROS(ASM);
    formatted_raw_ostream FOS(NameROS);

    // Ask the target to add backend passes as necessary.
    // NOTE(review): in the LLVM API of this era the parameter this flag is
    // passed to appears to be named `DisableVerify`, in which case `true`
    // skips the verifier rather than enabling it -- confirm against the LLVM
    // revision in use. The value is left unchanged here.
    const bool UseVerifier = true;
    if (Target.addPassesToEmitFile(PM, FOS, TargetMachine::CGFT_AssemblyFile,
                                   UseVerifier)) {
      errs() << "The target does not support generation of this file type!\n";
      return false;
    }

    PM.run(*New);
    FOS.flush();
  }
  return true;
}
/// Compile the PTX functions of the current module to PTX assembly and embed
/// the assembly text into the module as a global string named "llvm_kernel".
///
/// @param SubFunction Not used by this function.
/// @returns A pointer to the embedded string, or nullptr if the assembly
///          could not be generated (a message is printed to errs()).
Value *PTXGenerator::createPTXKernelFunction(Function *SubFunction) {
  Module *M = getModule();

  // Own the temporary module through a smart pointer so that it is released
  // on every path; the failure path used to return without deleting it.
  std::unique_ptr<Module> GPUModule(
      extractPTXFunctionsFromModule(M, GPUTriple));

  std::string LLVMKernelStr;
  if (!createASMAsString(GPUModule.get(), GPUTriple, "sm_20" /*MCPU*/,
                         "" /*Features*/, LLVMKernelStr)) {
    errs() << "Generate ptx string failed!\n";
    return nullptr;
  }

  return Builder.CreateGlobalStringPtr(LLVMKernelStr, "llvm_kernel");
}
/// Embed the name of \p SubFunction into the module as a global string named
/// "ptx_entry" and return a pointer to it.
Value *PTXGenerator::getPTXKernelEntryName(Function *SubFunction) {
  StringRef KernelName = SubFunction->getName();
  Value *NamePtr = Builder.CreateGlobalStringPtr(KernelName, "ptx_entry");
  return NamePtr;
}
/// Remove the PTX kernel and the PTX register intrinsics from the current
/// module.
///
/// @param SubFunction The kernel function to erase. It is erased
///                    unconditionally; each intrinsic is erased only if the
///                    module has a function of that name.
void PTXGenerator::eraseUnusedFunctions(Function *SubFunction) {
  // The intrinsics are looked up by name, in this order.
  static const char *const PTXIntrinsicNames[] = {
      "llvm.ptx.read.nctaid.x", "llvm.ptx.read.nctaid.y",
      "llvm.ptx.read.ctaid.x",  "llvm.ptx.read.ctaid.y",
      "llvm.ptx.read.ntid.x",   "llvm.ptx.read.ntid.y",
      "llvm.ptx.read.tid.x",    "llvm.ptx.read.tid.y"};

  Module *HostModule = getModule();
  SubFunction->eraseFromParent();

  for (const char *IntrinsicName : PTXIntrinsicNames) {
    Function *Decl = HostModule->getFunction(IntrinsicName);
    if (Decl)
      Decl->eraseFromParent();
  }
}
/// Emit the host code that drives the kernel and remove the kernel from the
/// host module.
///
/// At the builder's current position this emits, in order: stack slots for
/// the runtime handles, device initialization, PTX module and kernel entry
/// lookup, host/device memory allocation, kernel parameter setup, timer
/// start, kernel launch, copy-back of the results, timer stop and resource
/// cleanup. Afterwards the kernel and the PTX intrinsics are erased.
///
/// @param F The PTX kernel subfunction.
void PTXGenerator::finishGeneration(Function *F) {
  // Stack slots for the data exchanged with the GPURuntime library.
  AllocaInst *ContextSlot =
      Builder.CreateAlloca(getGPUContextPtrType(), nullptr, "phcontext");
  AllocaInst *DeviceSlot =
      Builder.CreateAlloca(getGPUDevicePtrType(), nullptr, "phdevice");
  AllocaInst *ModuleSlot =
      Builder.CreateAlloca(getGPUModulePtrType(), nullptr, "phmodule");
  AllocaInst *KernelSlot =
      Builder.CreateAlloca(getGPUFunctionPtrType(), nullptr, "phkernel");
  AllocaInst *StartEventSlot =
      Builder.CreateAlloca(getGPUEventPtrType(), nullptr, "pstart_timer");
  AllocaInst *StopEventSlot =
      Builder.CreateAlloca(getGPUEventPtrType(), nullptr, "pstop_timer");
  AllocaInst *DevDataSlot =
      Builder.CreateAlloca(getPtrGPUDevicePtrType(), nullptr, "pdevice_data");
  AllocaInst *HostDataSlot =
      Builder.CreateAlloca(getI8PtrType(), nullptr, "phost_data");
  Type *FloatTy = llvm::Type::getFloatTy(getModule()->getContext());
  AllocaInst *ElapsedSlot = Builder.CreateAlloca(FloatTy, nullptr, "ptimer");

  // Bring up the GPU device.
  createCallInitDevice(ContextSlot, DeviceSlot);

  // Embed the PTX text and the entry name, then ask the runtime for the
  // module and the kernel entry.
  // NOTE(review): createPTXKernelFunction() can return a null pointer when
  // the assembly cannot be generated; that case is not handled here.
  Value *PTXText = createPTXKernelFunction(F);
  Value *EntryName = getPTXKernelEntryName(F);
  createCallGetPTXModule(PTXText, ModuleSlot);
  LoadInst *LoadedModule = Builder.CreateLoad(ModuleSlot, "cumodule");
  createCallGetPTXKernelEntry(EntryName, LoadedModule, KernelSlot);

  // Allocate the device buffer together with its host counterpart.
  createCallAllocateMemoryForHostAndDevice(HostDataSlot, DevDataSlot,
                                           getOutputArraySizeInBytes());

  // Load the device buffer and the kernel handle and configure the launch.
  LoadInst *LoadedDevData = Builder.CreateLoad(DevDataSlot, "device_data");
  LoadInst *LoadedKernel = Builder.CreateLoad(KernelSlot, "cukernel");
  createCallSetKernelParameters(LoadedKernel, getCUDABlockWidth(),
                                getCUDABlockHeight(), LoadedDevData);

  // Set up both timer events and record the start time.
  createCallStartTimerByCudaEvent(StartEventSlot, StopEventSlot);

  // Run the kernel.
  createCallLaunchKernel(LoadedKernel, getCUDAGridWidth(),
                         getCUDAGridHeight());

  // Fetch the results from the device.
  LoadInst *LoadedHostData = Builder.CreateLoad(HostDataSlot, "host_data");
  createCallCopyFromDeviceToHost(LoadedHostData, LoadedDevData,
                                 getOutputArraySizeInBytes());

  // Record the end time.
  LoadInst *LoadedStartEvent =
      Builder.CreateLoad(StartEventSlot, "start_timer");
  LoadInst *LoadedStopEvent = Builder.CreateLoad(StopEventSlot, "stop_timer");
  createCallStopTimerByCudaEvent(LoadedStartEvent, LoadedStopEvent,
                                 ElapsedSlot);

  // Release everything that was acquired above.
  LoadInst *LoadedContext = Builder.CreateLoad(ContextSlot, "cucontext");
  createCallCleanupGPGPUResources(LoadedHostData, LoadedDevData, LoadedModule,
                                  LoadedContext, LoadedKernel);

  // The kernel, the device subfunctions and the PTX intrinsics must not stay
  // in the host module.
  eraseUnusedFunctions(F);
}
#endif /* GPU_CODEGEN */