[NVPTX] Add ranges to intrinsic definitions, cleanup NVVMIntrRange (#138338)
Pull the global intrinsic ranges out of NVVMIntrRange and into the intrinsic table-gen definitions. Also improve range inference for cluster SReg intrinsics.
This commit is contained in:
@@ -139,6 +139,19 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
|
||||
// MISC
|
||||
//
|
||||
|
||||
defvar WARP_SIZE = 32;
|
||||
|
||||
// Note: the maximum grid size in the x-dimension is the lower value of 65535
|
||||
// on sm_20. We conservatively use the larger value here as it is required for
|
||||
// sm_30+ and also correct for sm_20.
|
||||
defvar MAX_GRID_SIZE_X = 0x7fffffff;
|
||||
defvar MAX_GRID_SIZE_Y = 0xffff;
|
||||
defvar MAX_GRID_SIZE_Z = 0xffff;
|
||||
|
||||
defvar MAX_BLOCK_SIZE_X = 1024;
|
||||
defvar MAX_BLOCK_SIZE_Y = 1024;
|
||||
defvar MAX_BLOCK_SIZE_Z = 64;
|
||||
|
||||
// Helper class that concatenates list elements with
|
||||
// a given separator 'sep' and returns the result.
|
||||
// Handles empty strings.
|
||||
@@ -4747,26 +4760,35 @@ def int_nvvm_sust_p_3d_v4i32_trap
|
||||
|
||||
// Accessing special registers.
|
||||
|
||||
class PTXReadSRegIntrinsicNB_r32
|
||||
: DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>;
|
||||
class PTXReadSRegIntrinsic_r32<string name>
|
||||
: PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
|
||||
class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []>
|
||||
: DefaultAttrsIntrinsic<[llvm_i32_ty], [],
|
||||
!listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>;
|
||||
|
||||
multiclass PTXReadSRegIntrinsic_v4i32<string regname> {
|
||||
class PTXReadSRegIntrinsic_r32<string name,
|
||||
list<IntrinsicProperty> properties = []>
|
||||
: PTXReadSRegIntrinsicNB_r32<properties>,
|
||||
ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
|
||||
|
||||
multiclass PTXReadSRegIntrinsic_v4i32<string regname,
|
||||
list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
|
||||
assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
|
||||
// FIXME: Do we need the 128-bit integer type version?
|
||||
// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;
|
||||
|
||||
// FIXME: Enable this once v4i32 support is enabled in back-end.
|
||||
// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
|
||||
foreach suffix = ["_x", "_y", "_z", "_w"] in
|
||||
def suffix : PTXReadSRegIntrinsic_r32<regname # suffix>;
|
||||
defvar suffixes = ["_x", "_y", "_z", "_w"];
|
||||
foreach i = !range(suffixes) in
|
||||
def suffixes[i] : PTXReadSRegIntrinsic_r32<regname # suffixes[i], properties[i]>;
|
||||
}
|
||||
|
||||
// Same, but without automatic clang builtins. It will be used for
|
||||
// registers that require a particular GPU or PTX version.
|
||||
multiclass PTXReadSRegIntrinsicNB_v4i32 {
|
||||
foreach suffix = ["_x", "_y", "_z", "_w"] in
|
||||
def suffix : PTXReadSRegIntrinsicNB_r32;
|
||||
multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
|
||||
assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
|
||||
defvar suffixes = ["_x", "_y", "_z", "_w"];
|
||||
foreach i = !range(suffixes) in
|
||||
def suffixes[i] : PTXReadSRegIntrinsicNB_r32<properties[i]>;
|
||||
}
|
||||
|
||||
class PTXReadSRegIntrinsic_r64<string name>
|
||||
@@ -4782,15 +4804,41 @@ class PTXReadNCSRegIntrinsic_r64<string name>
|
||||
: Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
|
||||
ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
|
||||
|
||||
defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">;
|
||||
defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">;
|
||||
defm int_nvvm_read_ptx_sreg_tid
|
||||
: PTXReadSRegIntrinsic_v4i32<"tid",
|
||||
[[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
|
||||
[Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
|
||||
[Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
|
||||
[Range<RetIndex, 0, 1>]]>;
|
||||
|
||||
defm int_nvvm_read_ptx_sreg_ntid
|
||||
: PTXReadSRegIntrinsic_v4i32<"ntid",
|
||||
[[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
|
||||
[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
|
||||
[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
|
||||
[Range<RetIndex, 0, 1>]]>;
|
||||
|
||||
def int_nvvm_read_ptx_sreg_laneid
|
||||
: PTXReadSRegIntrinsic_r32<"laneid", [Range<RetIndex, 0, WARP_SIZE>]>;
|
||||
|
||||
def int_nvvm_read_ptx_sreg_laneid : PTXReadSRegIntrinsic_r32<"laneid">;
|
||||
def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32<"warpid">;
|
||||
def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32<"nwarpid">;
|
||||
|
||||
defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<"ctaid">;
|
||||
defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<"nctaid">;
|
||||
defvar MAX_GRID_ID_RANGE = [[Range<RetIndex, 0, MAX_GRID_SIZE_X>],
|
||||
[Range<RetIndex, 0, MAX_GRID_SIZE_Y>],
|
||||
[Range<RetIndex, 0, MAX_GRID_SIZE_Z>],
|
||||
[Range<RetIndex, 0, 1>]];
|
||||
|
||||
defvar MAX_GRID_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_GRID_SIZE_X, 1)>],
|
||||
[Range<RetIndex, 1, !add(MAX_GRID_SIZE_Y, 1)>],
|
||||
[Range<RetIndex, 1, !add(MAX_GRID_SIZE_Z, 1)>],
|
||||
[Range<RetIndex, 0, 1>]];
|
||||
|
||||
defm int_nvvm_read_ptx_sreg_ctaid
|
||||
: PTXReadSRegIntrinsic_v4i32<"ctaid", MAX_GRID_ID_RANGE>;
|
||||
|
||||
defm int_nvvm_read_ptx_sreg_nctaid
|
||||
: PTXReadSRegIntrinsic_v4i32<"nctaid", MAX_GRID_NID_RANGE>;
|
||||
|
||||
def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32<"smid">;
|
||||
def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32<"nsmid">;
|
||||
@@ -4817,13 +4865,25 @@ def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
|
||||
def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
|
||||
def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">;
|
||||
|
||||
def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
|
||||
def int_nvvm_read_ptx_sreg_warpsize
|
||||
: PTXReadSRegIntrinsic_r32<"warpsize",
|
||||
[Range<RetIndex, WARP_SIZE, !add(WARP_SIZE, 1)>]>;
|
||||
|
||||
// sm90+, PTX7.8+
|
||||
defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32;
|
||||
defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32;
|
||||
defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32;
|
||||
defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32;
|
||||
|
||||
// Note: Since clusters are subdivisions of the grid, we conservatively use the
|
||||
// maximum grid size as an upper bound for the clusterid and cluster_ctaid. In
|
||||
// practice, the clusterid will likely be much smaller. The CUDA programming
|
||||
// guide recommends 8 as a maximum portable value and H100s support 16.
|
||||
|
||||
defm int_nvvm_read_ptx_sreg_clusterid
|
||||
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
|
||||
defm int_nvvm_read_ptx_sreg_nclusterid
|
||||
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
|
||||
defm int_nvvm_read_ptx_sreg_cluster_ctaid
|
||||
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
|
||||
defm int_nvvm_read_ptx_sreg_cluster_nctaid
|
||||
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
|
||||
|
||||
def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
|
||||
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
|
||||
|
||||
@@ -330,6 +330,16 @@ std::optional<uint64_t> getOverallReqNTID(const Function &F) {
|
||||
return getVectorProduct(ReqNTID);
|
||||
}
|
||||
|
||||
std::optional<uint64_t> getOverallClusterRank(const Function &F) {
|
||||
// maxclusterrank and cluster_dim are mutually exclusive.
|
||||
if (const auto ClusterRank = getMaxClusterRank(F))
|
||||
return ClusterRank;
|
||||
|
||||
// Note: The semantics here are a bit strange. See getMaxNTID.
|
||||
const auto ClusterDim = getClusterDim(F);
|
||||
return getVectorProduct(ClusterDim);
|
||||
}
|
||||
|
||||
std::optional<unsigned> getMaxClusterRank(const Function &F) {
|
||||
return getFnAttrParsedInt(F, "nvvm.maxclusterrank");
|
||||
}
|
||||
|
||||
@@ -54,6 +54,7 @@ SmallVector<unsigned, 3> getClusterDim(const Function &);
|
||||
|
||||
std::optional<uint64_t> getOverallMaxNTID(const Function &);
|
||||
std::optional<uint64_t> getOverallReqNTID(const Function &);
|
||||
std::optional<uint64_t> getOverallClusterRank(const Function &);
|
||||
|
||||
std::optional<unsigned> getMaxClusterRank(const Function &);
|
||||
std::optional<unsigned> getMinCTASm(const Function &);
|
||||
|
||||
@@ -58,87 +58,89 @@ static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) {
|
||||
}
|
||||
|
||||
static bool runNVVMIntrRange(Function &F) {
|
||||
struct {
|
||||
unsigned x, y, z;
|
||||
} MaxBlockSize, MaxGridSize;
|
||||
struct Vector3 {
|
||||
unsigned X, Y, Z;
|
||||
};
|
||||
|
||||
const unsigned MetadataNTID = getOverallReqNTID(F).value_or(
|
||||
getOverallMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
|
||||
// All these annotations are only valid for kernel functions.
|
||||
if (!isKernelFunction(F))
|
||||
return false;
|
||||
|
||||
MaxBlockSize.x = std::min(1024u, MetadataNTID);
|
||||
MaxBlockSize.y = std::min(1024u, MetadataNTID);
|
||||
MaxBlockSize.z = std::min(64u, MetadataNTID);
|
||||
const auto OverallReqNTID = getOverallReqNTID(F);
|
||||
const auto OverallMaxNTID = getOverallMaxNTID(F);
|
||||
const auto OverallClusterRank = getOverallClusterRank(F);
|
||||
|
||||
MaxGridSize.x = 0x7fffffff;
|
||||
MaxGridSize.y = 0xffff;
|
||||
MaxGridSize.z = 0xffff;
|
||||
// If this function lacks any range information, do nothing.
|
||||
if (!(OverallReqNTID || OverallMaxNTID || OverallClusterRank))
|
||||
return false;
|
||||
|
||||
// Go through the calls in this function.
|
||||
bool Changed = false;
|
||||
for (Instruction &I : instructions(F)) {
|
||||
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
|
||||
if (!II)
|
||||
continue;
|
||||
const unsigned FunctionNTID = OverallReqNTID.value_or(
|
||||
OverallMaxNTID.value_or(std::numeric_limits<unsigned>::max()));
|
||||
|
||||
const unsigned FunctionClusterRank =
|
||||
OverallClusterRank.value_or(std::numeric_limits<unsigned>::max());
|
||||
|
||||
const Vector3 MaxBlockSize{std::min(1024u, FunctionNTID),
|
||||
std::min(1024u, FunctionNTID),
|
||||
std::min(64u, FunctionNTID)};
|
||||
|
||||
// We conservatively use the maximum grid size as an upper bound for the
|
||||
// cluster rank.
|
||||
const Vector3 MaxClusterRank{std::min(0x7fffffffu, FunctionClusterRank),
|
||||
std::min(0xffffu, FunctionClusterRank),
|
||||
std::min(0xffffu, FunctionClusterRank)};
|
||||
|
||||
const auto ProccessIntrinsic = [&](IntrinsicInst *II) -> bool {
|
||||
switch (II->getIntrinsicID()) {
|
||||
// Index within block
|
||||
case Intrinsic::nvvm_read_ptx_sreg_tid_x:
|
||||
Changed |= addRangeAttr(0, MaxBlockSize.x, II);
|
||||
break;
|
||||
return addRangeAttr(0, MaxBlockSize.X, II);
|
||||
case Intrinsic::nvvm_read_ptx_sreg_tid_y:
|
||||
Changed |= addRangeAttr(0, MaxBlockSize.y, II);
|
||||
break;
|
||||
return addRangeAttr(0, MaxBlockSize.Y, II);
|
||||
case Intrinsic::nvvm_read_ptx_sreg_tid_z:
|
||||
Changed |= addRangeAttr(0, MaxBlockSize.z, II);
|
||||
break;
|
||||
return addRangeAttr(0, MaxBlockSize.Z, II);
|
||||
|
||||
// Block size
|
||||
case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
|
||||
Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II);
|
||||
break;
|
||||
return addRangeAttr(1, MaxBlockSize.X + 1, II);
|
||||
case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
|
||||
Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II);
|
||||
break;
|
||||
return addRangeAttr(1, MaxBlockSize.Y + 1, II);
|
||||
case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
|
||||
Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II);
|
||||
break;
|
||||
return addRangeAttr(1, MaxBlockSize.Z + 1, II);
|
||||
|
||||
// Index within grid
|
||||
case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
|
||||
Changed |= addRangeAttr(0, MaxGridSize.x, II);
|
||||
break;
|
||||
case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
|
||||
Changed |= addRangeAttr(0, MaxGridSize.y, II);
|
||||
break;
|
||||
case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
|
||||
Changed |= addRangeAttr(0, MaxGridSize.z, II);
|
||||
break;
|
||||
// Cluster size
|
||||
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x:
|
||||
return addRangeAttr(0, MaxClusterRank.X, II);
|
||||
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y:
|
||||
return addRangeAttr(0, MaxClusterRank.Y, II);
|
||||
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z:
|
||||
return addRangeAttr(0, MaxClusterRank.Z, II);
|
||||
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x:
|
||||
return addRangeAttr(1, MaxClusterRank.X + 1, II);
|
||||
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y:
|
||||
return addRangeAttr(1, MaxClusterRank.Y + 1, II);
|
||||
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z:
|
||||
return addRangeAttr(1, MaxClusterRank.Z + 1, II);
|
||||
|
||||
// Grid size
|
||||
case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
|
||||
Changed |= addRangeAttr(1, MaxGridSize.x + 1, II);
|
||||
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank:
|
||||
if (OverallClusterRank)
|
||||
return addRangeAttr(0, FunctionClusterRank, II);
|
||||
break;
|
||||
case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
|
||||
Changed |= addRangeAttr(1, MaxGridSize.y + 1, II);
|
||||
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank:
|
||||
if (OverallClusterRank)
|
||||
return addRangeAttr(1, FunctionClusterRank + 1, II);
|
||||
break;
|
||||
case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
|
||||
Changed |= addRangeAttr(1, MaxGridSize.z + 1, II);
|
||||
break;
|
||||
|
||||
// warp size is constant 32.
|
||||
case Intrinsic::nvvm_read_ptx_sreg_warpsize:
|
||||
Changed |= addRangeAttr(32, 32 + 1, II);
|
||||
break;
|
||||
|
||||
// Lane ID is [0..warpsize)
|
||||
case Intrinsic::nvvm_read_ptx_sreg_laneid:
|
||||
Changed |= addRangeAttr(0, 32, II);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
// Go through the calls in this function.
|
||||
bool Changed = false;
|
||||
for (Instruction &I : instructions(F))
|
||||
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
|
||||
Changed |= ProccessIntrinsic(II);
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
|
||||
; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s
|
||||
; RUN: llvm-as < %s | llvm-dis | FileCheck %s --check-prefix=DEFAULT
|
||||
|
||||
define ptx_kernel i32 @test_maxntid() "nvvm.maxntid"="32,1,3" {
|
||||
; CHECK-LABEL: define ptx_kernel i32 @test_maxntid(
|
||||
@@ -74,10 +75,149 @@ define ptx_kernel i32 @test_inlined() "nvvm.maxntid"="4" {
|
||||
ret i32 %1
|
||||
}
|
||||
|
||||
define ptx_kernel i32 @test_cluster_ctaid() "nvvm.maxclusterrank"="8" {
|
||||
; CHECK-LABEL: define ptx_kernel i32 @test_cluster_ctaid(
|
||||
; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP1]], [[TMP2]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP6]]
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP7]]
|
||||
; CHECK-NEXT: ret i32 [[TMP15]]
|
||||
;
|
||||
%1 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
|
||||
%2 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
|
||||
%3 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
|
||||
%4 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
|
||||
%5 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
|
||||
%6 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
|
||||
%7 = add i32 %1, %2
|
||||
%8 = add i32 %7, %3
|
||||
%9 = add i32 %8, %4
|
||||
%10 = add i32 %9, %5
|
||||
%11 = add i32 %10, %6
|
||||
ret i32 %11
|
||||
}
|
||||
|
||||
define ptx_kernel i32 @test_cluster_dim() "nvvm.cluster_dim"="4,4,1" {
|
||||
; CHECK-LABEL: define ptx_kernel i32 @test_cluster_dim(
|
||||
; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP1]], [[TMP2]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]]
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP6]]
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP7]]
|
||||
; CHECK-NEXT: ret i32 [[TMP15]]
|
||||
;
|
||||
%1 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
|
||||
%2 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
|
||||
%3 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
|
||||
%4 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
|
||||
%5 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
|
||||
%6 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
|
||||
%7 = add i32 %1, %2
|
||||
%8 = add i32 %7, %3
|
||||
%9 = add i32 %8, %4
|
||||
%10 = add i32 %9, %5
|
||||
%11 = add i32 %10, %6
|
||||
ret i32 %11
|
||||
}
|
||||
|
||||
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.tid.w()
|
||||
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
|
||||
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid()
|
||||
; DEFAULT-DAG: declare noundef range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
|
||||
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
|
||||
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.clusterid.x()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.clusterid.y()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.clusterid.z()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.clusterid.w()
|
||||
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x()
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y()
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w()
|
||||
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.w()
|
||||
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
|
||||
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
|
||||
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.w()
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.tid.w()
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.x()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.y()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.z()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.w()
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w()
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.w()
|
||||
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
|
||||
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.w()
|
||||
|
||||
@@ -1,13 +1,10 @@
|
||||
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s
|
||||
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s
|
||||
; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \
|
||||
; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s
|
||||
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
|
||||
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
|
||||
|
||||
define ptx_device i32 @test_tid_x() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x;
|
||||
; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
ret i32 %x
|
||||
@@ -15,7 +12,6 @@ define ptx_device i32 @test_tid_x() {
|
||||
|
||||
define ptx_device i32 @test_tid_y() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y;
|
||||
; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
||||
ret i32 %x
|
||||
@@ -23,7 +19,6 @@ define ptx_device i32 @test_tid_y() {
|
||||
|
||||
define ptx_device i32 @test_tid_z() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z;
|
||||
; RANGE: call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
||||
ret i32 %x
|
||||
@@ -38,7 +33,6 @@ define ptx_device i32 @test_tid_w() {
|
||||
|
||||
define ptx_device i32 @test_ntid_x() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x;
|
||||
; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
ret i32 %x
|
||||
@@ -46,7 +40,6 @@ define ptx_device i32 @test_ntid_x() {
|
||||
|
||||
define ptx_device i32 @test_ntid_y() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y;
|
||||
; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
||||
ret i32 %x
|
||||
@@ -54,7 +47,6 @@ define ptx_device i32 @test_ntid_y() {
|
||||
|
||||
define ptx_device i32 @test_ntid_z() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z;
|
||||
; RANGE: call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
|
||||
ret i32 %x
|
||||
@@ -69,7 +61,6 @@ define ptx_device i32 @test_ntid_w() {
|
||||
|
||||
define ptx_device i32 @test_laneid() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %laneid;
|
||||
; RANGE: call range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
|
||||
ret i32 %x
|
||||
@@ -77,7 +68,6 @@ define ptx_device i32 @test_laneid() {
|
||||
|
||||
define ptx_device i32 @test_warpsize() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, WARP_SZ;
|
||||
; RANGE: call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
||||
ret i32 %x
|
||||
@@ -99,7 +89,6 @@ define ptx_device i32 @test_nwarpid() {
|
||||
|
||||
define ptx_device i32 @test_ctaid_y() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y;
|
||||
; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
||||
ret i32 %x
|
||||
@@ -107,7 +96,6 @@ define ptx_device i32 @test_ctaid_y() {
|
||||
|
||||
define ptx_device i32 @test_ctaid_z() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z;
|
||||
; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
|
||||
ret i32 %x
|
||||
@@ -115,7 +103,6 @@ define ptx_device i32 @test_ctaid_z() {
|
||||
|
||||
define ptx_device i32 @test_ctaid_x() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x;
|
||||
; RANGE: call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
||||
ret i32 %x
|
||||
@@ -130,7 +117,6 @@ define ptx_device i32 @test_ctaid_w() {
|
||||
|
||||
define ptx_device i32 @test_nctaid_y() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y;
|
||||
; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
|
||||
ret i32 %x
|
||||
@@ -138,7 +124,6 @@ define ptx_device i32 @test_nctaid_y() {
|
||||
|
||||
define ptx_device i32 @test_nctaid_z() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z;
|
||||
; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
|
||||
ret i32 %x
|
||||
@@ -146,7 +131,6 @@ define ptx_device i32 @test_nctaid_z() {
|
||||
|
||||
define ptx_device i32 @test_nctaid_x() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
|
||||
; RANGE: call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
|
||||
ret i32 %x
|
||||
@@ -154,7 +138,7 @@ define ptx_device i32 @test_nctaid_x() {
|
||||
|
||||
define ptx_device i32 @test_already_has_range_md() {
|
||||
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
|
||||
; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range ![[ALREADY:[0-9]+]]
|
||||
; CHECK: ret;
|
||||
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range !0
|
||||
ret i32 %x
|
||||
}
|
||||
@@ -316,4 +300,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.pm3()
|
||||
declare void @llvm.nvvm.bar.sync(i32 %i)
|
||||
|
||||
!0 = !{i32 0, i32 19}
|
||||
; RANGE-DAG: ![[ALREADY]] = !{i32 0, i32 19}
|
||||
|
||||
Reference in New Issue
Block a user