[NVPTX] Add ranges to intrinsic definitions, cleanup NVVMIntrRange (#138338)

Pull the global intrinsic ranges out of NVVMIntrRange and into the
intrinsic table-gen definitions. Also improve range inference for
cluster SReg intrinsics.
This commit is contained in:
Alex MacLean
2025-05-05 16:22:25 -07:00
committed by GitHub
parent 0bd065dc94
commit 37da5a10f0
6 changed files with 294 additions and 98 deletions

View File

@@ -139,6 +139,19 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
// MISC
//
defvar WARP_SIZE = 32;
// Note: the maximum grid size in the x-dimension is the lower value of 65535
// on sm_20. We conservatively use the larger value here as it is required for
// sm_30+ and is also correct for sm_20.
defvar MAX_GRID_SIZE_X = 0x7fffffff;
defvar MAX_GRID_SIZE_Y = 0xffff;
defvar MAX_GRID_SIZE_Z = 0xffff;
defvar MAX_BLOCK_SIZE_X = 1024;
defvar MAX_BLOCK_SIZE_Y = 1024;
defvar MAX_BLOCK_SIZE_Z = 64;
// Helper class that concatenates list elements with
// a given separator 'sep' and returns the result.
// Handles empty strings.
@@ -4747,26 +4760,35 @@ def int_nvvm_sust_p_3d_v4i32_trap
// Accessing special registers.
class PTXReadSRegIntrinsicNB_r32
: DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>;
class PTXReadSRegIntrinsic_r32<string name>
: PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []>
: DefaultAttrsIntrinsic<[llvm_i32_ty], [],
!listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>;
multiclass PTXReadSRegIntrinsic_v4i32<string regname> {
class PTXReadSRegIntrinsic_r32<string name,
list<IntrinsicProperty> properties = []>
: PTXReadSRegIntrinsicNB_r32<properties>,
ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
multiclass PTXReadSRegIntrinsic_v4i32<string regname,
list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
// FIXME: Do we need the 128-bit integer type version?
// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;
// FIXME: Enable this once v4i32 support is enabled in back-end.
// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
foreach suffix = ["_x", "_y", "_z", "_w"] in
def suffix : PTXReadSRegIntrinsic_r32<regname # suffix>;
defvar suffixes = ["_x", "_y", "_z", "_w"];
foreach i = !range(suffixes) in
def suffixes[i] : PTXReadSRegIntrinsic_r32<regname # suffixes[i], properties[i]>;
}
// Same, but without automatic clang builtins. It will be used for
// registers that require particular GPU or PTX version.
multiclass PTXReadSRegIntrinsicNB_v4i32 {
foreach suffix = ["_x", "_y", "_z", "_w"] in
def suffix : PTXReadSRegIntrinsicNB_r32;
multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
defvar suffixes = ["_x", "_y", "_z", "_w"];
foreach i = !range(suffixes) in
def suffixes[i] : PTXReadSRegIntrinsicNB_r32<properties[i]>;
}
class PTXReadSRegIntrinsic_r64<string name>
@@ -4782,15 +4804,41 @@ class PTXReadNCSRegIntrinsic_r64<string name>
: Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">;
defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">;
// Thread index within the block. The upper bound of Range is exclusive, so the
// x/y/z components lie in [0, MAX_BLOCK_SIZE_{X,Y,Z}); the w component is
// constrained to [0, 1), i.e. it is always 0.
defm int_nvvm_read_ptx_sreg_tid
: PTXReadSRegIntrinsic_v4i32<"tid",
[[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
[Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
[Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
[Range<RetIndex, 0, 1>]]>;
// Block size. The x/y/z components are at least 1 and at most
// MAX_BLOCK_SIZE_{X,Y,Z} inclusive, hence the !add(..., 1) on the exclusive
// upper bound; the w component is constrained to [0, 1), i.e. it is always 0.
defm int_nvvm_read_ptx_sreg_ntid
: PTXReadSRegIntrinsic_v4i32<"ntid",
[[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
[Range<RetIndex, 0, 1>]]>;
def int_nvvm_read_ptx_sreg_laneid
: PTXReadSRegIntrinsic_r32<"laneid", [Range<RetIndex, 0, WARP_SIZE>]>;
def int_nvvm_read_ptx_sreg_laneid : PTXReadSRegIntrinsic_r32<"laneid">;
def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32<"warpid">;
def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32<"nwarpid">;
defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<"ctaid">;
defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<"nctaid">;
// Per-component (x, y, z, w) return-value ranges for an index within the grid:
// [0, MAX_GRID_SIZE_*) for x/y/z, and [0, 1) for w. Upper bounds are exclusive.
defvar MAX_GRID_ID_RANGE = [[Range<RetIndex, 0, MAX_GRID_SIZE_X>],
[Range<RetIndex, 0, MAX_GRID_SIZE_Y>],
[Range<RetIndex, 0, MAX_GRID_SIZE_Z>],
[Range<RetIndex, 0, 1>]];
// Per-component (x, y, z, w) return-value ranges for a grid-sized count:
// x/y/z are in [1, MAX_GRID_SIZE_*] inclusive (expressed with an exclusive
// upper bound of MAX_GRID_SIZE_* + 1), and w is in [0, 1).
defvar MAX_GRID_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_GRID_SIZE_X, 1)>],
[Range<RetIndex, 1, !add(MAX_GRID_SIZE_Y, 1)>],
[Range<RetIndex, 1, !add(MAX_GRID_SIZE_Z, 1)>],
[Range<RetIndex, 0, 1>]];
defm int_nvvm_read_ptx_sreg_ctaid
: PTXReadSRegIntrinsic_v4i32<"ctaid", MAX_GRID_ID_RANGE>;
defm int_nvvm_read_ptx_sreg_nctaid
: PTXReadSRegIntrinsic_v4i32<"nctaid", MAX_GRID_NID_RANGE>;
def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32<"smid">;
def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32<"nsmid">;
@@ -4817,13 +4865,25 @@ def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">;
def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
def int_nvvm_read_ptx_sreg_warpsize
: PTXReadSRegIntrinsic_r32<"warpsize",
[Range<RetIndex, WARP_SIZE, !add(WARP_SIZE, 1)>]>;
// sm90+, PTX7.8+
defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32;
defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32;
defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32;
defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32;
// Note: Since clusters are subdivisions of the grid, we conservatively use the
// maximum grid size as an upper bound for the clusterid and cluster_ctaid. In
// practice, the cluster_ctaid will likely be much smaller: the CUDA programming
// guide recommends 8 CTAs as the maximum portable cluster size and H100s
// support 16.
defm int_nvvm_read_ptx_sreg_clusterid
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
defm int_nvvm_read_ptx_sreg_nclusterid
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
defm int_nvvm_read_ptx_sreg_cluster_ctaid
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
defm int_nvvm_read_ptx_sreg_cluster_nctaid
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;

View File

@@ -330,6 +330,16 @@ std::optional<uint64_t> getOverallReqNTID(const Function &F) {
return getVectorProduct(ReqNTID);
}
std::optional<uint64_t> getOverallClusterRank(const Function &F) {
// maxclusterrank and cluster_dim are mutually exclusive.
if (const auto ClusterRank = getMaxClusterRank(F))
return ClusterRank;
// Note: The semantics here are a bit strange. See getMaxNTID.
const auto ClusterDim = getClusterDim(F);
return getVectorProduct(ClusterDim);
}
/// Reads the "nvvm.maxclusterrank" function attribute of \p F through
/// getFnAttrParsedInt and returns the result.
/// NOTE(review): presumably std::nullopt when the attribute is absent or not
/// an integer; getFnAttrParsedInt is not visible here, so confirm.
std::optional<unsigned> getMaxClusterRank(const Function &F) {
return getFnAttrParsedInt(F, "nvvm.maxclusterrank");
}

View File

@@ -54,6 +54,7 @@ SmallVector<unsigned, 3> getClusterDim(const Function &);
std::optional<uint64_t> getOverallMaxNTID(const Function &);
std::optional<uint64_t> getOverallReqNTID(const Function &);
std::optional<uint64_t> getOverallClusterRank(const Function &);
std::optional<unsigned> getMaxClusterRank(const Function &);
std::optional<unsigned> getMinCTASm(const Function &);

View File

@@ -58,87 +58,89 @@ static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) {
}
static bool runNVVMIntrRange(Function &F) {
struct {
unsigned x, y, z;
} MaxBlockSize, MaxGridSize;
struct Vector3 {
unsigned X, Y, Z;
};
const unsigned MetadataNTID = getOverallReqNTID(F).value_or(
getOverallMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
// All these annotations are only valid for kernel functions.
if (!isKernelFunction(F))
return false;
MaxBlockSize.x = std::min(1024u, MetadataNTID);
MaxBlockSize.y = std::min(1024u, MetadataNTID);
MaxBlockSize.z = std::min(64u, MetadataNTID);
const auto OverallReqNTID = getOverallReqNTID(F);
const auto OverallMaxNTID = getOverallMaxNTID(F);
const auto OverallClusterRank = getOverallClusterRank(F);
MaxGridSize.x = 0x7fffffff;
MaxGridSize.y = 0xffff;
MaxGridSize.z = 0xffff;
// If this function lacks any range information, do nothing.
if (!(OverallReqNTID || OverallMaxNTID || OverallClusterRank))
return false;
// Go through the calls in this function.
bool Changed = false;
for (Instruction &I : instructions(F)) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
if (!II)
continue;
const unsigned FunctionNTID = OverallReqNTID.value_or(
OverallMaxNTID.value_or(std::numeric_limits<unsigned>::max()));
const unsigned FunctionClusterRank =
OverallClusterRank.value_or(std::numeric_limits<unsigned>::max());
const Vector3 MaxBlockSize{std::min(1024u, FunctionNTID),
std::min(1024u, FunctionNTID),
std::min(64u, FunctionNTID)};
// We conservatively use the maximum grid size as an upper bound for the
// cluster rank.
const Vector3 MaxClusterRank{std::min(0x7fffffffu, FunctionClusterRank),
std::min(0xffffu, FunctionClusterRank),
std::min(0xffffu, FunctionClusterRank)};
const auto ProccessIntrinsic = [&](IntrinsicInst *II) -> bool {
switch (II->getIntrinsicID()) {
// Index within block
case Intrinsic::nvvm_read_ptx_sreg_tid_x:
Changed |= addRangeAttr(0, MaxBlockSize.x, II);
break;
return addRangeAttr(0, MaxBlockSize.X, II);
case Intrinsic::nvvm_read_ptx_sreg_tid_y:
Changed |= addRangeAttr(0, MaxBlockSize.y, II);
break;
return addRangeAttr(0, MaxBlockSize.Y, II);
case Intrinsic::nvvm_read_ptx_sreg_tid_z:
Changed |= addRangeAttr(0, MaxBlockSize.z, II);
break;
return addRangeAttr(0, MaxBlockSize.Z, II);
// Block size
case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II);
break;
return addRangeAttr(1, MaxBlockSize.X + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II);
break;
return addRangeAttr(1, MaxBlockSize.Y + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II);
break;
return addRangeAttr(1, MaxBlockSize.Z + 1, II);
// Index within grid
case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
Changed |= addRangeAttr(0, MaxGridSize.x, II);
break;
case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
Changed |= addRangeAttr(0, MaxGridSize.y, II);
break;
case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
Changed |= addRangeAttr(0, MaxGridSize.z, II);
break;
// Cluster size
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x:
return addRangeAttr(0, MaxClusterRank.X, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y:
return addRangeAttr(0, MaxClusterRank.Y, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z:
return addRangeAttr(0, MaxClusterRank.Z, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x:
return addRangeAttr(1, MaxClusterRank.X + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y:
return addRangeAttr(1, MaxClusterRank.Y + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z:
return addRangeAttr(1, MaxClusterRank.Z + 1, II);
// Grid size
case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
Changed |= addRangeAttr(1, MaxGridSize.x + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank:
if (OverallClusterRank)
return addRangeAttr(0, FunctionClusterRank, II);
break;
case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
Changed |= addRangeAttr(1, MaxGridSize.y + 1, II);
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank:
if (OverallClusterRank)
return addRangeAttr(1, FunctionClusterRank + 1, II);
break;
case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
Changed |= addRangeAttr(1, MaxGridSize.z + 1, II);
break;
// warp size is constant 32.
case Intrinsic::nvvm_read_ptx_sreg_warpsize:
Changed |= addRangeAttr(32, 32 + 1, II);
break;
// Lane ID is [0..warpsize)
case Intrinsic::nvvm_read_ptx_sreg_laneid:
Changed |= addRangeAttr(0, 32, II);
break;
default:
break;
return false;
}
}
return false;
};
// Go through the calls in this function.
bool Changed = false;
for (Instruction &I : instructions(F))
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
Changed |= ProccessIntrinsic(II);
return Changed;
}

View File

@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s
; RUN: llvm-as < %s | llvm-dis | FileCheck %s --check-prefix=DEFAULT
define ptx_kernel i32 @test_maxntid() "nvvm.maxntid"="32,1,3" {
; CHECK-LABEL: define ptx_kernel i32 @test_maxntid(
@@ -74,10 +75,149 @@ define ptx_kernel i32 @test_inlined() "nvvm.maxntid"="4" {
ret i32 %1
}
; With "nvvm.maxclusterrank"="8", every cluster.ctaid component is expected to
; be annotated with range [0, 8) and every cluster.nctaid component with
; range [1, 9).
define ptx_kernel i32 @test_cluster_ctaid() "nvvm.maxclusterrank"="8" {
; CHECK-LABEL: define ptx_kernel i32 @test_cluster_ctaid(
; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 0, 8) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 1, 9) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]]
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP5]]
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP6]]
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP7]]
; CHECK-NEXT: ret i32 [[TMP15]]
;
%1 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
%2 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
%3 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
%4 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
%5 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
%6 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
%7 = add i32 %1, %2
%8 = add i32 %7, %3
%9 = add i32 %8, %4
%10 = add i32 %9, %5
%11 = add i32 %10, %6
ret i32 %11
}
; With "nvvm.cluster_dim"="4,4,1" the expected bound is the product of the
; dimensions (4 * 4 * 1 = 16), applied uniformly to every component rather
; than per dimension: cluster.ctaid.* gets [0, 16) and cluster.nctaid.* gets
; [1, 17), including the z component whose declared dimension is 1.
define ptx_kernel i32 @test_cluster_dim() "nvvm.cluster_dim"="4,4,1" {
; CHECK-LABEL: define ptx_kernel i32 @test_cluster_dim(
; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 0, 16) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 1, 17) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]]
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP5]]
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP6]]
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP7]]
; CHECK-NEXT: ret i32 [[TMP15]]
;
%1 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
%2 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
%3 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
%4 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
%5 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
%6 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
%7 = add i32 %1, %2
%8 = add i32 %7, %3
%9 = add i32 %8, %4
%10 = add i32 %9, %5
%11 = add i32 %10, %6
ret i32 %11
}
; DEFAULT-DAG: declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; DEFAULT-DAG: declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
; DEFAULT-DAG: declare noundef range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.tid.w()
; DEFAULT-DAG: declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
; DEFAULT-DAG: declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
; DEFAULT-DAG: declare noundef range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
; DEFAULT-DAG: declare noundef range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid()
; DEFAULT-DAG: declare noundef range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.clusterid.x()
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.clusterid.y()
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.clusterid.z()
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.clusterid.w()
; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x()
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y()
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z()
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w()
; DEFAULT-DAG: declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
; DEFAULT-DAG: declare noundef range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.w()
; DEFAULT-DAG: declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
; DEFAULT-DAG: declare noundef range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
; DEFAULT-DAG: declare noundef range(i32 0, 1) i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.w()
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.tid.w()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
declare i32 @llvm.nvvm.read.ptx.sreg.laneid()
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.clusterid.w()
declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctaid.w()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.x()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.y()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.z()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctaid.w()

View File

@@ -1,13 +1,10 @@
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s
; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \
; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
define ptx_device i32 @test_tid_x() {
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x;
; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
ret i32 %x
@@ -15,7 +12,6 @@ define ptx_device i32 @test_tid_x() {
define ptx_device i32 @test_tid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y;
; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
ret i32 %x
@@ -23,7 +19,6 @@ define ptx_device i32 @test_tid_y() {
define ptx_device i32 @test_tid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z;
; RANGE: call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
ret i32 %x
@@ -38,7 +33,6 @@ define ptx_device i32 @test_tid_w() {
define ptx_device i32 @test_ntid_x() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x;
; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
ret i32 %x
@@ -46,7 +40,6 @@ define ptx_device i32 @test_ntid_x() {
define ptx_device i32 @test_ntid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y;
; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
ret i32 %x
@@ -54,7 +47,6 @@ define ptx_device i32 @test_ntid_y() {
define ptx_device i32 @test_ntid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z;
; RANGE: call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
ret i32 %x
@@ -69,7 +61,6 @@ define ptx_device i32 @test_ntid_w() {
define ptx_device i32 @test_laneid() {
; CHECK: mov.u32 %r{{[0-9]+}}, %laneid;
; RANGE: call range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
ret i32 %x
@@ -77,7 +68,6 @@ define ptx_device i32 @test_laneid() {
define ptx_device i32 @test_warpsize() {
; CHECK: mov.u32 %r{{[0-9]+}}, WARP_SZ;
; RANGE: call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
ret i32 %x
@@ -99,7 +89,6 @@ define ptx_device i32 @test_nwarpid() {
define ptx_device i32 @test_ctaid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y;
; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
ret i32 %x
@@ -107,7 +96,6 @@ define ptx_device i32 @test_ctaid_y() {
define ptx_device i32 @test_ctaid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z;
; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
ret i32 %x
@@ -115,7 +103,6 @@ define ptx_device i32 @test_ctaid_z() {
define ptx_device i32 @test_ctaid_x() {
; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x;
; RANGE: call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
ret i32 %x
@@ -130,7 +117,6 @@ define ptx_device i32 @test_ctaid_w() {
define ptx_device i32 @test_nctaid_y() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y;
; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
ret i32 %x
@@ -138,7 +124,6 @@ define ptx_device i32 @test_nctaid_y() {
define ptx_device i32 @test_nctaid_z() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z;
; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
ret i32 %x
@@ -146,7 +131,6 @@ define ptx_device i32 @test_nctaid_z() {
define ptx_device i32 @test_nctaid_x() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
; RANGE: call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
ret i32 %x
@@ -154,7 +138,7 @@ define ptx_device i32 @test_nctaid_x() {
define ptx_device i32 @test_already_has_range_md() {
; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range ![[ALREADY:[0-9]+]]
; CHECK: ret;
%x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range !0
ret i32 %x
}
@@ -316,4 +300,3 @@ declare i32 @llvm.nvvm.read.ptx.sreg.pm3()
declare void @llvm.nvvm.bar.sync(i32 %i)
!0 = !{i32 0, i32 19}
; RANGE-DAG: ![[ALREADY]] = !{i32 0, i32 19}