[AMDGPU] Adding the amdgpu_num_work_groups function attribute (#79035)

A new function attribute named amdgpu_num_work_groups is added. This
attribute, which consists of three integers, allows programmers to let
the compiler know the number of workgroups to be launched in each of the
three dimensions and do optimizations based on that information.

---------

Co-authored-by: Jun Wang <jun.wang7@amd.com>
This commit is contained in:
Jun Wang
2024-03-12 10:30:39 -07:00
committed by GitHub
parent 93503aafcd
commit c4e517f59c
21 changed files with 628 additions and 1 deletions

View File

@@ -194,6 +194,12 @@ Removed Compiler Flags
Attribute Changes in Clang
--------------------------
- Introduced a new function attribute ``__attribute__((amdgpu_max_num_work_groups(x, y, z)))`` or
``[[clang::amdgpu_max_num_work_groups(x, y, z)]]`` for the AMDGPU target. This attribute can be
attached to HIP or OpenCL kernel function definitions to provide an optimization hint. The parameters
``x``, ``y``, and ``z`` specify the maximum number of workgroups for the respective dimensions,
and each must be a positive integer when provided. The parameter ``x`` is required, while ``y`` and
``z`` are optional with default value of 1.
Improvements to Clang's diagnostics
-----------------------------------

View File

@@ -2054,6 +2054,13 @@ def AMDGPUNumVGPR : InheritableAttr {
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
}
def AMDGPUMaxNumWorkGroups : InheritableAttr {
let Spellings = [Clang<"amdgpu_max_num_work_groups", 0>];
let Args = [ExprArgument<"MaxNumWorkGroupsX">, ExprArgument<"MaxNumWorkGroupsY", 1>, ExprArgument<"MaxNumWorkGroupsZ", 1>];
let Documentation = [AMDGPUMaxNumWorkGroupsDocs];
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
}
def AMDGPUKernelCall : DeclOrTypeAttr {
let Spellings = [Clang<"amdgpu_kernel">];
let Documentation = [Undocumented];

View File

@@ -2741,6 +2741,33 @@ An error will be given if:
}];
}
def AMDGPUMaxNumWorkGroupsDocs : Documentation {
let Category = DocCatAMDGPUAttributes;
let Content = [{
This attribute specifies the max number of work groups when the kernel
is dispatched.
Clang supports the
``__attribute__((amdgpu_max_num_work_groups(<x>, <y>, <z>)))`` or
``[[clang::amdgpu_max_num_work_groups(<x>, <y>, <z>)]]`` attribute for the
AMDGPU target. This attribute may be attached to HIP or OpenCL kernel function
definitions and is an optimization hint.
The ``<x>`` parameter specifies the maximum number of work groups in the x dimension.
Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
Each of the three values must be greater than 0 when provided. The ``<x>`` parameter
is required, while ``<y>`` and ``<z>`` are optional with default value of 1.
If specified, the AMDGPU target backend might be able to produce better machine
code.
An error will be given if:
- Specified values violate subtarget specifications;
- Specified values are not compatible with values provided through other
attributes.
}];
}
def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
let Content = [{
Clang supports several different calling conventions, depending on the target

View File

@@ -3911,6 +3911,16 @@ public:
void addAMDGPUWavesPerEUAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *Min, Expr *Max);
/// Create an AMDGPUMaxNumWorkGroupsAttr attribute.
AMDGPUMaxNumWorkGroupsAttr *
CreateAMDGPUMaxNumWorkGroupsAttr(const AttributeCommonInfo &CI, Expr *XExpr,
Expr *YExpr, Expr *ZExpr);
/// addAMDGPUMaxNumWorkGroupsAttr - Adds an amdgpu_max_num_work_groups
/// attribute to a particular declaration.
void addAMDGPUMaxNumWorkGroupsAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *XExpr, Expr *YExpr, Expr *ZExpr);
DLLImportAttr *mergeDLLImportAttr(Decl *D, const AttributeCommonInfo &CI);
DLLExportAttr *mergeDLLExportAttr(Decl *D, const AttributeCommonInfo &CI);
MSInheritanceAttr *mergeMSInheritanceAttr(Decl *D,

View File

@@ -356,6 +356,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
if (NumVGPR != 0)
F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
}
if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
uint32_t X = Attr->getMaxNumWorkGroupsX()
->EvaluateKnownConstInt(M.getContext())
.getExtValue();
// Y and Z dimensions default to 1 if not specified
uint32_t Y = Attr->getMaxNumWorkGroupsY()
? Attr->getMaxNumWorkGroupsY()
->EvaluateKnownConstInt(M.getContext())
.getExtValue()
: 1;
uint32_t Z = Attr->getMaxNumWorkGroupsZ()
? Attr->getMaxNumWorkGroupsZ()
->EvaluateKnownConstInt(M.getContext())
.getExtValue()
: 1;
llvm::SmallString<32> AttrVal;
llvm::raw_svector_ostream OS(AttrVal);
OS << X << ',' << Y << ',' << Z;
F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
}
}
/// Emits control constants used to change per-architecture behaviour in the

View File

@@ -8079,6 +8079,65 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
}
static bool
checkAMDGPUMaxNumWorkGroupsArguments(Sema &S, Expr *XExpr, Expr *YExpr,
Expr *ZExpr,
const AMDGPUMaxNumWorkGroupsAttr &Attr) {
if (S.DiagnoseUnexpandedParameterPack(XExpr) ||
(YExpr && S.DiagnoseUnexpandedParameterPack(YExpr)) ||
(ZExpr && S.DiagnoseUnexpandedParameterPack(ZExpr)))
return true;
// Accept template arguments for now as they depend on something else.
// We'll get to check them when they eventually get instantiated.
if (XExpr->isValueDependent() || (YExpr && YExpr->isValueDependent()) ||
(ZExpr && ZExpr->isValueDependent()))
return false;
uint32_t NumWG = 0;
Expr *Exprs[3] = {XExpr, YExpr, ZExpr};
for (int i = 0; i < 3; i++) {
if (Exprs[i]) {
if (!checkUInt32Argument(S, Attr, Exprs[i], NumWG, i,
/*StrictlyUnsigned=*/true))
return true;
if (NumWG == 0) {
S.Diag(Attr.getLoc(), diag::err_attribute_argument_is_zero)
<< &Attr << Exprs[i]->getSourceRange();
return true;
}
}
}
return false;
}
AMDGPUMaxNumWorkGroupsAttr *
Sema::CreateAMDGPUMaxNumWorkGroupsAttr(const AttributeCommonInfo &CI,
Expr *XExpr, Expr *YExpr, Expr *ZExpr) {
AMDGPUMaxNumWorkGroupsAttr TmpAttr(Context, CI, XExpr, YExpr, ZExpr);
if (checkAMDGPUMaxNumWorkGroupsArguments(*this, XExpr, YExpr, ZExpr, TmpAttr))
return nullptr;
return ::new (Context)
AMDGPUMaxNumWorkGroupsAttr(Context, CI, XExpr, YExpr, ZExpr);
}
void Sema::addAMDGPUMaxNumWorkGroupsAttr(Decl *D, const AttributeCommonInfo &CI,
Expr *XExpr, Expr *YExpr,
Expr *ZExpr) {
if (auto *Attr = CreateAMDGPUMaxNumWorkGroupsAttr(CI, XExpr, YExpr, ZExpr))
D->addAttr(Attr);
}
static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
const ParsedAttr &AL) {
Expr *YExpr = (AL.getNumArgs() > 1) ? AL.getArgAsExpr(1) : nullptr;
Expr *ZExpr = (AL.getNumArgs() > 2) ? AL.getArgAsExpr(2) : nullptr;
S.addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), YExpr, ZExpr);
}
static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
const ParsedAttr &AL) {
// If we try to apply it to a function pointer, don't warn, but don't
@@ -9183,6 +9242,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_AMDGPUNumVGPR:
handleAMDGPUNumVGPRAttr(S, D, AL);
break;
case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
handleAMDGPUMaxNumWorkGroupsAttr(S, D, AL);
break;
case ParsedAttr::AT_AVRSignal:
handleAVRSignalAttr(S, D, AL);
break;

View File

@@ -607,6 +607,29 @@ static void instantiateDependentAMDGPUWavesPerEUAttr(
S.addAMDGPUWavesPerEUAttr(New, Attr, MinExpr, MaxExpr);
}
static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
const AMDGPUMaxNumWorkGroupsAttr &Attr, Decl *New) {
EnterExpressionEvaluationContext Unevaluated(
S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs);
if (!ResultX.isUsable())
return;
ExprResult ResultY = S.SubstExpr(Attr.getMaxNumWorkGroupsY(), TemplateArgs);
if (!ResultY.isUsable())
return;
ExprResult ResultZ = S.SubstExpr(Attr.getMaxNumWorkGroupsZ(), TemplateArgs);
if (!ResultZ.isUsable())
return;
Expr *XExpr = ResultX.getAs<Expr>();
Expr *YExpr = ResultY.getAs<Expr>();
Expr *ZExpr = ResultZ.getAs<Expr>();
S.addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
}
// This doesn't take any template parameters, but we have a custom action that
// needs to happen when the kernel itself is instantiated. We need to run the
// ItaniumMangler to mark the names required to name this kernel.
@@ -792,6 +815,12 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
*AMDGPUFlatWorkGroupSize, New);
}
if (const auto *AMDGPUMaxNumWorkGroups =
dyn_cast<AMDGPUMaxNumWorkGroupsAttr>(TmplAttr)) {
instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
*this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
}
if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
New);

View File

@@ -40,12 +40,45 @@ __attribute__((amdgpu_num_vgpr(64))) // expected-no-diagnostics
__global__ void num_vgpr_64() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z11num_vgpr_64v() [[NUM_VGPR_64:#[0-9]+]]
}
__attribute__((amdgpu_max_num_work_groups(32, 4, 2))) // expected-no-diagnostics
__global__ void max_num_work_groups_32_4_2() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z26max_num_work_groups_32_4_2v() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
}
__attribute__((amdgpu_max_num_work_groups(32))) // expected-no-diagnostics
__global__ void max_num_work_groups_32() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z22max_num_work_groups_32v() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
}
__attribute__((amdgpu_max_num_work_groups(32,1))) // expected-no-diagnostics
__global__ void max_num_work_groups_32_1() {
// CHECK: define{{.*}} amdgpu_kernel void @_Z24max_num_work_groups_32_1v() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
}
template<unsigned a>
__attribute__((amdgpu_max_num_work_groups(a, 4, 2)))
__global__ void template_a_4_2_max_num_work_groups() {}
template __global__ void template_a_4_2_max_num_work_groups<32>();
// CHECK: define{{.*}} amdgpu_kernel void @_Z34template_a_4_2_max_num_work_groupsILj32EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
template<unsigned a>
__attribute__((amdgpu_max_num_work_groups(32, a, 2)))
__global__ void template_32_a_2_max_num_work_groups() {}
template __global__ void template_32_a_2_max_num_work_groups<4>();
// CHECK: define{{.*}} amdgpu_kernel void @_Z35template_32_a_2_max_num_work_groupsILj4EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
template<unsigned a>
__attribute__((amdgpu_max_num_work_groups(32, 4, a)))
__global__ void template_32_4_a_max_num_work_groups() {}
template __global__ void template_32_4_a_max_num_work_groups<2>();
// CHECK: define{{.*}} amdgpu_kernel void @_Z35template_32_4_a_max_num_work_groupsILj2EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
// Make sure this is silently accepted on other targets.
// NAMD-NOT: "amdgpu-flat-work-group-size"
// NAMD-NOT: "amdgpu-waves-per-eu"
// NAMD-NOT: "amdgpu-num-vgpr"
// NAMD-NOT: "amdgpu-num-sgpr"
// NAMD-NOT: "amdgpu-max-num-work-groups"
// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
@@ -53,5 +86,7 @@ __global__ void num_vgpr_64() {
// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_4_2]] = {{.*}}"amdgpu-max-num-workgroups"="32,4,2"
// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_1_1]] = {{.*}}"amdgpu-max-num-workgroups"="32,1,1"
// NOUB-NOT: "uniform-work-group-size"="true"

View File

@@ -139,6 +139,46 @@ kernel void reqd_work_group_size_32_2_1_flat_work_group_size_16_128() {
// CHECK: define{{.*}} amdgpu_kernel void @reqd_work_group_size_32_2_1_flat_work_group_size_16_128() [[FLAT_WORK_GROUP_SIZE_16_128:#[0-9]+]]
}
__attribute__((amdgpu_max_num_work_groups(1, 1, 1))) // expected-no-diagnostics
kernel void max_num_work_groups_1_1_1() {
// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_1() [[MAX_NUM_WORK_GROUPS_1_1_1:#[0-9]+]]
}
__attribute__((amdgpu_max_num_work_groups(32, 1, 1))) // expected-no-diagnostics
kernel void max_num_work_groups_32_1_1() {
// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_1_1() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
}
__attribute__((amdgpu_max_num_work_groups(32, 8, 1))) // expected-no-diagnostics
kernel void max_num_work_groups_32_8_1() {
// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_8_1() [[MAX_NUM_WORK_GROUPS_32_8_1:#[0-9]+]]
}
__attribute__((amdgpu_max_num_work_groups(1, 1, 32))) // expected-no-diagnostics
kernel void max_num_work_groups_1_1_32() {
// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_32() [[MAX_NUM_WORK_GROUPS_1_1_32:#[0-9]+]]
}
__attribute__((amdgpu_max_num_work_groups(1, 8, 32))) // expected-no-diagnostics
kernel void max_num_work_groups_1_8_32() {
// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_8_32() [[MAX_NUM_WORK_GROUPS_1_8_32:#[0-9]+]]
}
__attribute__((amdgpu_max_num_work_groups(4, 8, 32))) // expected-no-diagnostics
kernel void max_num_work_groups_4_8_32() {
// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_4_8_32() [[MAX_NUM_WORK_GROUPS_4_8_32:#[0-9]+]]
}
__attribute__((amdgpu_max_num_work_groups(32))) // expected-no-diagnostics
kernel void max_num_work_groups_32() {
// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
}
__attribute__((amdgpu_max_num_work_groups(32,1))) // expected-no-diagnostics
kernel void max_num_work_groups_32_1() {
// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_1() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
}
void a_function() {
// CHECK: define{{.*}} void @a_function() [[A_FUNCTION:#[0-9]+]]
}
@@ -189,5 +229,12 @@ kernel void default_kernel() {
// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2"
// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2,4"
// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_1_1]] = {{.*}} "amdgpu-max-num-workgroups"="1,1,1"
// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_1_1]] = {{.*}} "amdgpu-max-num-workgroups"="32,1,1"
// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_8_1]] = {{.*}} "amdgpu-max-num-workgroups"="32,8,1"
// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_1_32]] = {{.*}} "amdgpu-max-num-workgroups"="1,1,32"
// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_8_32]] = {{.*}} "amdgpu-max-num-workgroups"="1,8,32"
// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_4_8_32]] = {{.*}} "amdgpu-max-num-workgroups"="4,8,32"
// CHECK-DAG: attributes [[A_FUNCTION]] = {{.*}}
// CHECK-DAG: attributes [[DEFAULT_KERNEL_ATTRS]] = {{.*}} "amdgpu-flat-work-group-size"="1,256"

View File

@@ -4,6 +4,7 @@
// CHECK: #pragma clang attribute supports the following attributes:
// CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
// CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
// CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
// CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)

View File

@@ -63,6 +63,16 @@ __global__ void flat_work_group_size_32_64_waves_per_eu_2_num_sgpr_32_num_vgpr_6
__attribute__((amdgpu_flat_work_group_size(32, 64), amdgpu_waves_per_eu(2, 4), amdgpu_num_sgpr(32), amdgpu_num_vgpr(64)))
__global__ void flat_work_group_size_32_64_waves_per_eu_2_4_num_sgpr_32_num_vgpr_64() {}
__attribute__((amdgpu_max_num_work_groups(32, 1, 1)))
__global__ void max_num_work_groups_32_1_1() {}
__attribute__((amdgpu_max_num_work_groups(32, 1, 1), amdgpu_flat_work_group_size(32, 64)))
__global__ void max_num_work_groups_32_1_1_flat_work_group_size_32_64() {}
__attribute__((amdgpu_max_num_work_groups(32, 1, 1), amdgpu_flat_work_group_size(32, 64), amdgpu_waves_per_eu(2, 4), amdgpu_num_sgpr(32), amdgpu_num_vgpr(64)))
__global__ void max_num_work_groups_32_1_1_flat_work_group_size_32_64_waves_per_eu_2_4_num_sgpr_32_num_vgpr_64() {}
// expected-error@+2{{attribute 'reqd_work_group_size' can only be applied to an OpenCL kernel function}}
__attribute__((reqd_work_group_size(32, 64, 64)))
__global__ void reqd_work_group_size_32_64_64() {}
@@ -194,3 +204,125 @@ __global__ void non_cexpr_waves_per_eu_2() {}
// expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 1 to be an integer constant}}
__attribute__((amdgpu_waves_per_eu(2, ipow2(2))))
__global__ void non_cexpr_waves_per_eu_2_4() {}
__attribute__((amdgpu_max_num_work_groups(32)))
__global__ void max_num_work_groups_32() {}
__attribute__((amdgpu_max_num_work_groups(32, 1)))
__global__ void max_num_work_groups_32_1() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute takes no more than 3 arguments}}
__attribute__((amdgpu_max_num_work_groups(32, 1, 1, 1)))
__global__ void max_num_work_groups_32_1_1_1() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute takes at least 1 argument}}
__attribute__((amdgpu_max_num_work_groups()))
__global__ void max_num_work_groups_no_arg() {}
// expected-error@+1{{expected expression}}
__attribute__((amdgpu_max_num_work_groups(,1,1)))
__global__ void max_num_work_groups_empty_1_1() {}
// expected-error@+1{{expected expression}}
__attribute__((amdgpu_max_num_work_groups(32,,1)))
__global__ void max_num_work_groups_32_empty_1() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}}
__attribute__((amdgpu_max_num_work_groups(ipow2(5), 1, 1)))
__global__ void max_num_work_groups_32_1_1_non_int_arg0() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}}
__attribute__((amdgpu_max_num_work_groups(32, "1", 1)))
__global__ void max_num_work_groups_32_1_1_non_int_arg1() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
__attribute__((amdgpu_max_num_work_groups(-32, 1, 1)))
__global__ void max_num_work_groups_32_1_1_neg_int_arg0() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
__attribute__((amdgpu_max_num_work_groups(32, -1, 1)))
__global__ void max_num_work_groups_32_1_1_neg_int_arg1() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
__attribute__((amdgpu_max_num_work_groups(32, 1, -1)))
__global__ void max_num_work_groups_32_1_1_neg_int_arg2() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
__attribute__((amdgpu_max_num_work_groups(0, 1, 1)))
__global__ void max_num_work_groups_0_1_1() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
__attribute__((amdgpu_max_num_work_groups(32, 0, 1)))
__global__ void max_num_work_groups_32_0_1() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
__attribute__((amdgpu_max_num_work_groups(32, 1, 0)))
__global__ void max_num_work_groups_32_1_0() {}
__attribute__((amdgpu_max_num_work_groups(4294967295)))
__global__ void max_num_work_groups_max_unsigned_int() {}
// expected-error@+1{{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
__attribute__((amdgpu_max_num_work_groups(4294967296)))
__global__ void max_num_work_groups_max_unsigned_int_plus1() {}
// expected-error@+1{{integer constant expression evaluates to value 10000000000 that cannot be represented in a 32-bit unsigned integer type}}
__attribute__((amdgpu_max_num_work_groups(10000000000)))
__global__ void max_num_work_groups_too_large() {}
int num_wg_x = 32;
int num_wg_y = 1;
int num_wg_z = 1;
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}}
__attribute__((amdgpu_max_num_work_groups(num_wg_x, 1, 1)))
__global__ void max_num_work_groups_32_1_1_non_const_arg0() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}}
__attribute__((amdgpu_max_num_work_groups(32, num_wg_y, 1)))
__global__ void max_num_work_groups_32_1_1_non_const_arg1() {}
// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 2 to be an integer constant}}
__attribute__((amdgpu_max_num_work_groups(32, 1, num_wg_z)))
__global__ void max_num_work_groups_32_1_1_non_const_arg2() {}
const int c_num_wg_x = 32;
__attribute__((amdgpu_max_num_work_groups(c_num_wg_x, 1, 1)))
__global__ void max_num_work_groups_32_1_1_const_arg0() {}
template<unsigned a>
__attribute__((amdgpu_max_num_work_groups(a, 1, 1)))
__global__ void template_a_1_1_max_num_work_groups() {}
template __global__ void template_a_1_1_max_num_work_groups<32>();
template<unsigned a>
__attribute__((amdgpu_max_num_work_groups(32, a, 1)))
__global__ void template_32_a_1_max_num_work_groups() {}
template __global__ void template_32_a_1_max_num_work_groups<1>();
template<unsigned a>
__attribute__((amdgpu_max_num_work_groups(32, 1, a)))
__global__ void template_32_1_a_max_num_work_groups() {}
template __global__ void template_32_1_a_max_num_work_groups<1>();
// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
// expected-note@+4{{in instantiation of}}
template<unsigned b>
__attribute__((amdgpu_max_num_work_groups(b, 1, 1)))
__global__ void template_b_1_1_max_num_work_groups() {}
template __global__ void template_b_1_1_max_num_work_groups<0>();
// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
// expected-note@+4{{in instantiation of}}
template<unsigned b>
__attribute__((amdgpu_max_num_work_groups(32, b, 1)))
__global__ void template_32_b_1_max_num_work_groups() {}
template __global__ void template_32_b_1_max_num_work_groups<0>();
// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
// expected-note@+4{{in instantiation of}}
template<unsigned b>
__attribute__((amdgpu_max_num_work_groups(32, 1, b)))
__global__ void template_32_1_b_max_num_work_groups() {}
template __global__ void template_32_1_b_max_num_work_groups<0>();

View File

@@ -1442,6 +1442,11 @@ The AMDGPU backend supports the following LLVM IR attributes.
the frame. This is an internal detail of how LDS variables are lowered,
language front ends should not set this attribute.
"amdgpu-max-num-workgroups"="x,y,z" Specify the maximum number of work groups for the kernel dispatch in the
X, Y, and Z dimensions. Generated by the ``amdgpu_max_num_work_groups``
CLANG attribute [CLANG-ATTR]_. Clang only emits this attribute when all
the three numbers are >= 1.
======================================= ==========================================================
Calling Conventions
@@ -3917,6 +3922,11 @@ same *vendor-name*.
If omitted, "normal" is
assumed.
".max_num_work_groups_{x,y,z}" integer The max number of
launched work-groups
in the X, Y, and Z
dimensions. Each number
must be >=1.
=================================== ============== ========= ================================
..

View File

@@ -494,6 +494,14 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
Kern[".max_flat_workgroup_size"] =
Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
unsigned NumWGX = MFI.getMaxNumWorkGroupsX();
unsigned NumWGY = MFI.getMaxNumWorkGroupsY();
unsigned NumWGZ = MFI.getMaxNumWorkGroupsZ();
if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
Kern[".max_num_workgroups_x"] = Kern.getDocument()->getNode(NumWGX);
Kern[".max_num_workgroups_y"] = Kern.getDocument()->getNode(NumWGY);
Kern[".max_num_workgroups_z"] = Kern.getDocument()->getNode(NumWGZ);
}
Kern[".sgpr_spill_count"] =
Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
Kern[".vgpr_spill_count"] =

View File

@@ -432,7 +432,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
// If minimum/maximum flat work group sizes were explicitly requested using
// "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
// "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
// number of waves per execution unit to values implied by requested
// minimum/maximum flat work group sizes.
unsigned MinImpliedByFlatWorkGroupSize =
@@ -1108,3 +1108,8 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
}

View File

@@ -288,6 +288,9 @@ public:
/// 2) dimension.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
/// Return the number of work groups for the function.
SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
/// Return true if only a single workitem can be active in a wave.
bool isSingleLaneExecution(const Function &Kernel) const;

View File

@@ -46,6 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
assert(MaxNumWorkGroups.size() == 3);
Occupancy = ST.computeOccupancy(F, getLDSSize());
CallingConv::ID CC = F.getCallingConv();

View File

@@ -426,6 +426,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;
// Default/requested number of work groups for the function.
SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
private:
unsigned NumUserSGPRs = 0;
unsigned NumSystemSGPRs = 0;
@@ -1072,6 +1075,13 @@ public:
// \returns true if a function needs or may need AGPRs.
bool usesAGPRs(const MachineFunction &MF) const;
/// \returns Default/requested number of work groups for this function.
SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
};
} // end namespace llvm

View File

@@ -11,6 +11,7 @@
#include "AMDGPUAsmUtils.h"
#include "AMDKernelCodeT.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
@@ -1298,6 +1299,42 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
return Ints;
}
SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
unsigned Size) {
assert(Size > 2);
SmallVector<unsigned> Default(Size, 0);
Attribute A = F.getFnAttribute(Name);
if (!A.isStringAttribute())
return Default;
SmallVector<unsigned> Vals(Size, 0);
LLVMContext &Ctx = F.getContext();
StringRef S = A.getValueAsString();
unsigned i = 0;
for (; !S.empty() && i < Size; i++) {
std::pair<StringRef, StringRef> Strs = S.split(',');
unsigned IntVal;
if (Strs.first.trim().getAsInteger(0, IntVal)) {
Ctx.emitError("can't parse integer attribute " + Strs.first + " in " +
Name);
return Default;
}
Vals[i] = IntVal;
S = Strs.second;
}
if (!S.empty() || i < Size) {
Ctx.emitError("attribute " + Name +
" has incorrect number of integers; expected " +
llvm::utostr(Size));
return Default;
}
return Vals;
}
unsigned getVmcntBitMask(const IsaVersion &Version) {
return (1 << (getVmcntBitWidthLo(Version.Major) +
getVmcntBitWidthHi(Version.Major))) -

View File

@@ -863,6 +863,14 @@ bool isReadOnlySegment(const GlobalValue *GV);
/// target triple \p TT, false otherwise.
bool shouldEmitConstantsToTextSection(const Triple &TT);
/// \returns Integer value requested using \p F's \p Name attribute.
///
/// \returns \p Default if attribute is not present.
///
/// \returns \p Default and emits error if requested value cannot be converted
/// to integer.
int getIntegerAttribute(const Function &F, StringRef Name, int Default);
/// \returns A pair of integer values requested using \p F's \p Name attribute
/// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
/// is false).
@@ -877,6 +885,16 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
std::pair<unsigned, unsigned> Default,
bool OnlyFirstRequired = false);
/// \returns Generate a vector of integer values requested using \p F's \p Name
/// attribute.
///
/// \returns true if exactly Size (>2) number of integers are found in the
/// attribute.
///
/// \returns false if any error occurs.
SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
unsigned Size);
/// Represents the counter values to wait for in an s_waitcnt instruction.
///
/// Large values (including the maximum possible integer) can be used to

View File

@@ -0,0 +1,84 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
; Attribute not specified.
; CHECK-LABEL: {{^}}empty_no_attribute:
define amdgpu_kernel void @empty_no_attribute() {
entry:
ret void
}
; Ignore if number of work groups for x dimension is 0.
; CHECK-LABEL: {{^}}empty_max_num_workgroups_x0:
define amdgpu_kernel void @empty_max_num_workgroups_x0() #0 {
entry:
ret void
}
attributes #0 = {"amdgpu-max-num-workgroups"="0,2,3"}
; Ignore if number of work groups for y dimension is 0.
; CHECK-LABEL: {{^}}empty_max_num_workgroups_y0:
define amdgpu_kernel void @empty_max_num_workgroups_y0() #1 {
entry:
ret void
}
attributes #1 = {"amdgpu-max-num-workgroups"="1,0,3"}
; Ignore if number of work groups for z dimension is 0.
; CHECK-LABEL: {{^}}empty_max_num_workgroups_z0:
define amdgpu_kernel void @empty_max_num_workgroups_z0() #2 {
entry:
ret void
}
attributes #2 = {"amdgpu-max-num-workgroups"="1,2,0"}
; CHECK-LABEL: {{^}}empty_max_num_workgroups_1_2_3:
define amdgpu_kernel void @empty_max_num_workgroups_1_2_3() #3 {
entry:
ret void
}
attributes #3 = {"amdgpu-max-num-workgroups"="1,2,3"}
; CHECK-LABEL: {{^}}empty_max_num_workgroups_1024_1024_1024:
define amdgpu_kernel void @empty_max_num_workgroups_1024_1024_1024() #4 {
entry:
ret void
}
attributes #4 = {"amdgpu-max-num-workgroups"="1024,1024,1024"}
; CHECK: .amdgpu_metadata
; CHECK: - .args:
; CHECK: .max_flat_workgroup_size: 1024
; CHECK-NEXT: .name: empty_no_attribute
; CHECK-NEXT: .private_segment_fixed_size: 0
; CHECK: - .args:
; CHECK: .max_flat_workgroup_size: 1024
; CHECK-NEXT: .name: empty_max_num_workgroups_x0
; CHECK-NEXT: .private_segment_fixed_size: 0
; CHECK: - .args:
; CHECK: .max_flat_workgroup_size: 1024
; CHECK-NEXT: .name: empty_max_num_workgroups_y0
; CHECK-NEXT: .private_segment_fixed_size: 0
; CHECK: - .args:
; CHECK: .max_flat_workgroup_size: 1024
; CHECK-NEXT: .name: empty_max_num_workgroups_z0
; CHECK-NEXT: .private_segment_fixed_size: 0
; CHECK: - .args:
; CHECK: .max_flat_workgroup_size: 1024
; CHECK-NEXT: .max_num_workgroups_x: 1
; CHECK-NEXT: .max_num_workgroups_y: 2
; CHECK-NEXT: .max_num_workgroups_z: 3
; CHECK-NEXT: .name: empty_max_num_workgroups_1_2_3
; CHECK-NEXT: .private_segment_fixed_size: 0
; CHECK: - .args:
; CHECK: .max_flat_workgroup_size: 1024
; CHECK-NEXT: .max_num_workgroups_x: 1024
; CHECK-NEXT: .max_num_workgroups_y: 1024
; CHECK-NEXT: .max_num_workgroups_z: 1024
; CHECK-NEXT: .name: empty_max_num_workgroups_1024_1024_1024
; CHECK-NEXT: .private_segment_fixed_size: 0

View File

@@ -0,0 +1,71 @@
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s 2>&1 | FileCheck --check-prefix=ERROR %s
; ERROR: error: can't parse integer attribute -1 in amdgpu-max-num-workgroups
define amdgpu_kernel void @empty_max_num_workgroups_neg_num1() #21 {
entry:
ret void
}
attributes #21 = {"amdgpu-max-num-workgroups"="-1,2,3"}
; ERROR: error: can't parse integer attribute -2 in amdgpu-max-num-workgroups
define amdgpu_kernel void @empty_max_num_workgroups_neg_num2() #22 {
entry:
ret void
}
attributes #22 = {"amdgpu-max-num-workgroups"="1,-2,3"}
; ERROR: error: can't parse integer attribute -3 in amdgpu-max-num-workgroups
define amdgpu_kernel void @empty_max_num_workgroups_neg_num3() #23 {
entry:
ret void
}
attributes #23 = {"amdgpu-max-num-workgroups"="1,2,-3"}
; ERROR: error: can't parse integer attribute 1.0 in amdgpu-max-num-workgroups
define amdgpu_kernel void @empty_max_num_workgroups_non_int1() #31 {
entry:
ret void
}
attributes #31 = {"amdgpu-max-num-workgroups"="1.0,2,3"}
; ERROR: error: can't parse integer attribute 2.0 in amdgpu-max-num-workgroups
define amdgpu_kernel void @empty_max_num_workgroups_non_int2() #32 {
entry:
ret void
}
attributes #32 = {"amdgpu-max-num-workgroups"="1,2.0,3"}
; ERROR: error: can't parse integer attribute 3.0 in amdgpu-max-num-workgroups
define amdgpu_kernel void @empty_max_num_workgroups_non_int3() #33 {
entry:
ret void
}
attributes #33 = {"amdgpu-max-num-workgroups"="1,2,3.0"}
; ERROR: error: can't parse integer attribute 10000000000 in amdgpu-max-num-workgroups
define amdgpu_kernel void @empty_max_num_workgroups_too_large() #41 {
entry:
ret void
}
attributes #41 = {"amdgpu-max-num-workgroups"="10000000000,2,3"}
; ERROR: error: attribute amdgpu-max-num-workgroups has incorrect number of integers; expected 3
define amdgpu_kernel void @empty_max_num_workgroups_1_arg() #51 {
entry:
ret void
}
attributes #51 = {"amdgpu-max-num-workgroups"="1"}
; ERROR: error: attribute amdgpu-max-num-workgroups has incorrect number of integers; expected 3
define amdgpu_kernel void @empty_max_num_workgroups_2_args() #52 {
entry:
ret void
}
attributes #52 = {"amdgpu-max-num-workgroups"="1,2"}
; ERROR: error: attribute amdgpu-max-num-workgroups has incorrect number of integers; expected 3
define amdgpu_kernel void @empty_max_num_workgroups_4_args() #53 {
entry:
ret void
}
attributes #53 = {"amdgpu-max-num-workgroups"="1,2,3,4"}