[AMDGPU] Adding the amdgpu_num_work_groups function attribute (#79035)

A new function attribute named amdgpu_num_work_groups is added. This attribute, which consists of three integers, allows programmers to let the compiler know the number of workgroups to be launched in each of the three dimensions and do optimizations based on that information. --------- Co-authored-by: Jun Wang <jun.wang7@amd.com>
2024-03-12 10:30:39 -07:00
parent 93503aafcd
commit c4e517f59c
21 changed files with 628 additions and 1 deletions
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -194,6 +194,12 @@ Removed Compiler Flags

 Attribute Changes in Clang
 --------------------------
+- Introduced a new function attribute ``__attribute__((amdgpu_max_num_work_groups(x, y, z)))`` or
+  ``[[clang::amdgpu_max_num_work_groups(x, y, z)]]`` for the AMDGPU target. This attribute can be
+  attached to HIP or OpenCL kernel function definitions to provide an optimization hint. The parameters
+  ``x``, ``y``, and ``z`` specify the maximum number of workgroups for the respective dimensions,
+  and each must be a positive integer when provided. The parameter ``x`` is required, while ``y`` and
+  ``z`` are optional with default value of 1.

 Improvements to Clang's diagnostics
 -----------------------------------
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2054,6 +2054,13 @@ def AMDGPUNumVGPR : InheritableAttr {
  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }

+def AMDGPUMaxNumWorkGroups : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_max_num_work_groups", 0>];
+  let Args = [ExprArgument<"MaxNumWorkGroupsX">, ExprArgument<"MaxNumWorkGroupsY", 1>, ExprArgument<"MaxNumWorkGroupsZ", 1>];
+  let Documentation = [AMDGPUMaxNumWorkGroupsDocs];
+  let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
+}
+
 def AMDGPUKernelCall : DeclOrTypeAttr {
  let Spellings = [Clang<"amdgpu_kernel">];
  let Documentation = [Undocumented];
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2741,6 +2741,33 @@ An error will be given if:
  }];
 }

+def AMDGPUMaxNumWorkGroupsDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+This attribute specifies the max number of work groups when the kernel
+is dispatched.
+
+Clang supports the
+``__attribute__((amdgpu_max_num_work_groups(<x>, <y>, <z>)))`` or
+``[[clang::amdgpu_max_num_work_groups(<x>, <y>, <z>)]]`` attribute for the
+AMDGPU target. This attribute may be attached to HIP or OpenCL kernel function
+definitions and is an optimization hint.
+
+The ``<x>`` parameter specifies the maximum number of work groups in the x dimension.
+Similarly ``<y>`` and ``<z>`` are for the y and z dimensions respectively.
+Each of the three values must be greater than 0 when provided. The ``<x>`` parameter
+is required, while ``<y>`` and ``<z>`` are optional with default value of 1.
+
+If specified, the AMDGPU target backend might be able to produce better machine
+code.
+
+An error will be given if:
+  - Specified values violate subtarget specifications;
+  - Specified values are not compatible with values provided through other
+    attributes.
+  }];
+}
+
 def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
  let Content = [{
 Clang supports several different calling conventions, depending on the target
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -3911,6 +3911,16 @@ public:
  void addAMDGPUWavesPerEUAttr(Decl *D, const AttributeCommonInfo &CI,
                               Expr *Min, Expr *Max);

+  /// Create an AMDGPUMaxNumWorkGroupsAttr attribute.
+  AMDGPUMaxNumWorkGroupsAttr *
+  CreateAMDGPUMaxNumWorkGroupsAttr(const AttributeCommonInfo &CI, Expr *XExpr,
+                                   Expr *YExpr, Expr *ZExpr);
+
+  /// addAMDGPUMaxNumWorkGroupsAttr - Adds an amdgpu_max_num_work_groups
+  /// attribute to a particular declaration.
+  void addAMDGPUMaxNumWorkGroupsAttr(Decl *D, const AttributeCommonInfo &CI,
+                                     Expr *XExpr, Expr *YExpr, Expr *ZExpr);
+
  DLLImportAttr *mergeDLLImportAttr(Decl *D, const AttributeCommonInfo &CI);
  DLLExportAttr *mergeDLLExportAttr(Decl *D, const AttributeCommonInfo &CI);
  MSInheritanceAttr *mergeMSInheritanceAttr(Decl *D,
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -356,6 +356,29 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }
+
+  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
+    uint32_t X = Attr->getMaxNumWorkGroupsX()
+                     ->EvaluateKnownConstInt(M.getContext())
+                     .getExtValue();
+    // Y and Z dimensions default to 1 if not specified
+    uint32_t Y = Attr->getMaxNumWorkGroupsY()
+                     ? Attr->getMaxNumWorkGroupsY()
+                           ->EvaluateKnownConstInt(M.getContext())
+                           .getExtValue()
+                     : 1;
+    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
+                     ? Attr->getMaxNumWorkGroupsZ()
+                           ->EvaluateKnownConstInt(M.getContext())
+                           .getExtValue()
+                     : 1;
+
+    llvm::SmallString<32> AttrVal;
+    llvm::raw_svector_ostream OS(AttrVal);
+    OS << X << ',' << Y << ',' << Z;
+
+    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
+  }
 }

 /// Emits control constants used to change per-architecture behaviour in the
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8079,6 +8079,65 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
  D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR));
 }

+static bool
+checkAMDGPUMaxNumWorkGroupsArguments(Sema &S, Expr *XExpr, Expr *YExpr,
+                                     Expr *ZExpr,
+                                     const AMDGPUMaxNumWorkGroupsAttr &Attr) {
+  if (S.DiagnoseUnexpandedParameterPack(XExpr) ||
+      (YExpr && S.DiagnoseUnexpandedParameterPack(YExpr)) ||
+      (ZExpr && S.DiagnoseUnexpandedParameterPack(ZExpr)))
+    return true;
+
+  // Accept template arguments for now as they depend on something else.
+  // We'll get to check them when they eventually get instantiated.
+  if (XExpr->isValueDependent() || (YExpr && YExpr->isValueDependent()) ||
+      (ZExpr && ZExpr->isValueDependent()))
+    return false;
+
+  uint32_t NumWG = 0;
+  Expr *Exprs[3] = {XExpr, YExpr, ZExpr};
+  for (int i = 0; i < 3; i++) {
+    if (Exprs[i]) {
+      if (!checkUInt32Argument(S, Attr, Exprs[i], NumWG, i,
+                               /*StrictlyUnsigned=*/true))
+        return true;
+      if (NumWG == 0) {
+        S.Diag(Attr.getLoc(), diag::err_attribute_argument_is_zero)
+            << &Attr << Exprs[i]->getSourceRange();
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+AMDGPUMaxNumWorkGroupsAttr *
+Sema::CreateAMDGPUMaxNumWorkGroupsAttr(const AttributeCommonInfo &CI,
+                                       Expr *XExpr, Expr *YExpr, Expr *ZExpr) {
+  AMDGPUMaxNumWorkGroupsAttr TmpAttr(Context, CI, XExpr, YExpr, ZExpr);
+
+  if (checkAMDGPUMaxNumWorkGroupsArguments(*this, XExpr, YExpr, ZExpr, TmpAttr))
+    return nullptr;
+
+  return ::new (Context)
+      AMDGPUMaxNumWorkGroupsAttr(Context, CI, XExpr, YExpr, ZExpr);
+}
+
+void Sema::addAMDGPUMaxNumWorkGroupsAttr(Decl *D, const AttributeCommonInfo &CI,
+                                         Expr *XExpr, Expr *YExpr,
+                                         Expr *ZExpr) {
+  if (auto *Attr = CreateAMDGPUMaxNumWorkGroupsAttr(CI, XExpr, YExpr, ZExpr))
+    D->addAttr(Attr);
+}
+
+static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D,
+                                             const ParsedAttr &AL) {
+  Expr *YExpr = (AL.getNumArgs() > 1) ? AL.getArgAsExpr(1) : nullptr;
+  Expr *ZExpr = (AL.getNumArgs() > 2) ? AL.getArgAsExpr(2) : nullptr;
+  S.addAMDGPUMaxNumWorkGroupsAttr(D, AL, AL.getArgAsExpr(0), YExpr, ZExpr);
+}
+
 static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
                                              const ParsedAttr &AL) {
  // If we try to apply it to a function pointer, don't warn, but don't
@@ -9183,6 +9242,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
  case ParsedAttr::AT_AMDGPUNumVGPR:
    handleAMDGPUNumVGPRAttr(S, D, AL);
    break;
+  case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
+    handleAMDGPUMaxNumWorkGroupsAttr(S, D, AL);
+    break;
  case ParsedAttr::AT_AVRSignal:
    handleAVRSignalAttr(S, D, AL);
    break;
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -607,6 +607,29 @@ static void instantiateDependentAMDGPUWavesPerEUAttr(
  S.addAMDGPUWavesPerEUAttr(New, Attr, MinExpr, MaxExpr);
 }

+static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
+    Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+    const AMDGPUMaxNumWorkGroupsAttr &Attr, Decl *New) {
+  EnterExpressionEvaluationContext Unevaluated(
+      S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+  ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs);
+  if (!ResultX.isUsable())
+    return;
+  ExprResult ResultY = S.SubstExpr(Attr.getMaxNumWorkGroupsY(), TemplateArgs);
+  if (!ResultY.isUsable())
+    return;
+  ExprResult ResultZ = S.SubstExpr(Attr.getMaxNumWorkGroupsZ(), TemplateArgs);
+  if (!ResultZ.isUsable())
+    return;
+
+  Expr *XExpr = ResultX.getAs<Expr>();
+  Expr *YExpr = ResultY.getAs<Expr>();
+  Expr *ZExpr = ResultZ.getAs<Expr>();
+
+  S.addAMDGPUMaxNumWorkGroupsAttr(New, Attr, XExpr, YExpr, ZExpr);
+}
+
 // This doesn't take any template parameters, but we have a custom action that
 // needs to happen when the kernel itself is instantiated. We need to run the
 // ItaniumMangler to mark the names required to name this kernel.
@@ -792,6 +815,12 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
                                               *AMDGPUFlatWorkGroupSize, New);
    }

+    if (const auto *AMDGPUMaxNumWorkGroups =
+            dyn_cast<AMDGPUMaxNumWorkGroupsAttr>(TmplAttr)) {
+      instantiateDependentAMDGPUMaxNumWorkGroupsAttr(
+          *this, TemplateArgs, *AMDGPUMaxNumWorkGroups, New);
+    }
+
    if (const auto *ParamAttr = dyn_cast<HLSLParamModifierAttr>(TmplAttr)) {
      instantiateDependentHLSLParamModifierAttr(*this, TemplateArgs, ParamAttr,
                                                New);
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -40,12 +40,45 @@ __attribute__((amdgpu_num_vgpr(64))) // expected-no-diagnostics
 __global__ void num_vgpr_64() {
 // CHECK: define{{.*}} amdgpu_kernel void @_Z11num_vgpr_64v() [[NUM_VGPR_64:#[0-9]+]]
 }
+__attribute__((amdgpu_max_num_work_groups(32, 4, 2))) // expected-no-diagnostics
+__global__ void max_num_work_groups_32_4_2() {
+// CHECK: define{{.*}} amdgpu_kernel void @_Z26max_num_work_groups_32_4_2v() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+}
+__attribute__((amdgpu_max_num_work_groups(32))) // expected-no-diagnostics
+__global__ void max_num_work_groups_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @_Z22max_num_work_groups_32v() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
+}
+__attribute__((amdgpu_max_num_work_groups(32,1))) // expected-no-diagnostics
+__global__ void max_num_work_groups_32_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @_Z24max_num_work_groups_32_1v() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
+}
+
+
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(a, 4, 2)))
+__global__ void template_a_4_2_max_num_work_groups() {}
+template __global__ void template_a_4_2_max_num_work_groups<32>();
+// CHECK: define{{.*}} amdgpu_kernel void @_Z34template_a_4_2_max_num_work_groupsILj32EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, a, 2)))
+__global__ void template_32_a_2_max_num_work_groups() {}
+template __global__ void template_32_a_2_max_num_work_groups<4>();
+// CHECK: define{{.*}} amdgpu_kernel void @_Z35template_32_a_2_max_num_work_groupsILj4EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, 4, a)))
+__global__ void template_32_4_a_max_num_work_groups() {}
+template __global__ void template_32_4_a_max_num_work_groups<2>();
+// CHECK: define{{.*}} amdgpu_kernel void @_Z35template_32_4_a_max_num_work_groupsILj2EEvv() [[MAX_NUM_WORK_GROUPS_32_4_2:#[0-9]+]]

 // Make sure this is silently accepted on other targets.
 // NAMD-NOT: "amdgpu-flat-work-group-size"
 // NAMD-NOT: "amdgpu-waves-per-eu"
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
+// NAMD-NOT: "amdgpu-max-num-work-groups"

 // DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
@@ -53,5 +86,7 @@ __global__ void num_vgpr_64() {
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
 // CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
 // CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_4_2]] = {{.*}}"amdgpu-max-num-workgroups"="32,4,2"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_1_1]] = {{.*}}"amdgpu-max-num-workgroups"="32,1,1"

 // NOUB-NOT: "uniform-work-group-size"="true"
--- a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
@@ -139,6 +139,46 @@ kernel void reqd_work_group_size_32_2_1_flat_work_group_size_16_128() {
 // CHECK: define{{.*}} amdgpu_kernel void @reqd_work_group_size_32_2_1_flat_work_group_size_16_128() [[FLAT_WORK_GROUP_SIZE_16_128:#[0-9]+]]
 }

+__attribute__((amdgpu_max_num_work_groups(1, 1, 1))) // expected-no-diagnostics
+kernel void max_num_work_groups_1_1_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_1() [[MAX_NUM_WORK_GROUPS_1_1_1:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1))) // expected-no-diagnostics
+kernel void max_num_work_groups_32_1_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_1_1() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(32, 8, 1))) // expected-no-diagnostics
+kernel void max_num_work_groups_32_8_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_8_1() [[MAX_NUM_WORK_GROUPS_32_8_1:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(1, 1, 32))) // expected-no-diagnostics
+kernel void max_num_work_groups_1_1_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_32() [[MAX_NUM_WORK_GROUPS_1_1_32:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(1, 8, 32))) // expected-no-diagnostics
+kernel void max_num_work_groups_1_8_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_8_32() [[MAX_NUM_WORK_GROUPS_1_8_32:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(4, 8, 32))) // expected-no-diagnostics
+kernel void max_num_work_groups_4_8_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_4_8_32() [[MAX_NUM_WORK_GROUPS_4_8_32:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(32))) // expected-no-diagnostics
+kernel void max_num_work_groups_32() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
+}
+
+__attribute__((amdgpu_max_num_work_groups(32,1))) // expected-no-diagnostics
+kernel void max_num_work_groups_32_1() {
+// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_1() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]]
+}
+
 void a_function() {
 // CHECK: define{{.*}} void @a_function() [[A_FUNCTION:#[0-9]+]]
 }
@@ -189,5 +229,12 @@ kernel void default_kernel() {
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2,4"

+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_1_1]] = {{.*}} "amdgpu-max-num-workgroups"="1,1,1"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_1_1]] = {{.*}} "amdgpu-max-num-workgroups"="32,1,1"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_32_8_1]] = {{.*}} "amdgpu-max-num-workgroups"="32,8,1"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_1_32]] = {{.*}} "amdgpu-max-num-workgroups"="1,1,32"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_1_8_32]] = {{.*}} "amdgpu-max-num-workgroups"="1,8,32"
+// CHECK-DAG: attributes [[MAX_NUM_WORK_GROUPS_4_8_32]] = {{.*}} "amdgpu-max-num-workgroups"="4,8,32"
+
 // CHECK-DAG: attributes [[A_FUNCTION]] = {{.*}}
 // CHECK-DAG: attributes [[DEFAULT_KERNEL_ATTRS]] = {{.*}} "amdgpu-flat-work-group-size"="1,256"
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -4,6 +4,7 @@

 // CHECK: #pragma clang attribute supports the following attributes:
 // CHECK-NEXT: AMDGPUFlatWorkGroupSize (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
--- a/clang/test/SemaCUDA/amdgpu-attrs.cu
+++ b/clang/test/SemaCUDA/amdgpu-attrs.cu
@@ -63,6 +63,16 @@ __global__ void flat_work_group_size_32_64_waves_per_eu_2_num_sgpr_32_num_vgpr_6
 __attribute__((amdgpu_flat_work_group_size(32, 64), amdgpu_waves_per_eu(2, 4), amdgpu_num_sgpr(32), amdgpu_num_vgpr(64)))
 __global__ void flat_work_group_size_32_64_waves_per_eu_2_4_num_sgpr_32_num_vgpr_64() {}

+__attribute__((amdgpu_max_num_work_groups(32, 1, 1)))
+__global__ void max_num_work_groups_32_1_1() {}
+
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1), amdgpu_flat_work_group_size(32, 64)))
+__global__ void max_num_work_groups_32_1_1_flat_work_group_size_32_64() {}
+
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1), amdgpu_flat_work_group_size(32, 64), amdgpu_waves_per_eu(2, 4), amdgpu_num_sgpr(32), amdgpu_num_vgpr(64)))
+__global__ void max_num_work_groups_32_1_1_flat_work_group_size_32_64_waves_per_eu_2_4_num_sgpr_32_num_vgpr_64() {}
+
+
 // expected-error@+2{{attribute 'reqd_work_group_size' can only be applied to an OpenCL kernel function}}
 __attribute__((reqd_work_group_size(32, 64, 64)))
 __global__ void reqd_work_group_size_32_64_64() {}
@@ -194,3 +204,125 @@ __global__ void non_cexpr_waves_per_eu_2() {}
 // expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 1 to be an integer constant}}
 __attribute__((amdgpu_waves_per_eu(2, ipow2(2))))
 __global__ void non_cexpr_waves_per_eu_2_4() {}
+
+__attribute__((amdgpu_max_num_work_groups(32)))
+__global__ void max_num_work_groups_32() {}
+
+__attribute__((amdgpu_max_num_work_groups(32, 1)))
+__global__ void max_num_work_groups_32_1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute takes no more than 3 arguments}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute takes at least 1 argument}}
+__attribute__((amdgpu_max_num_work_groups()))
+__global__ void max_num_work_groups_no_arg() {}
+
+// expected-error@+1{{expected expression}}
+__attribute__((amdgpu_max_num_work_groups(,1,1)))
+__global__ void max_num_work_groups_empty_1_1() {}
+
+// expected-error@+1{{expected expression}}
+__attribute__((amdgpu_max_num_work_groups(32,,1)))
+__global__ void max_num_work_groups_32_empty_1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(ipow2(5), 1, 1)))
+__global__ void max_num_work_groups_32_1_1_non_int_arg0() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(32, "1", 1)))
+__global__ void max_num_work_groups_32_1_1_non_int_arg1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
+__attribute__((amdgpu_max_num_work_groups(-32, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_neg_int_arg0() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
+__attribute__((amdgpu_max_num_work_groups(32, -1, 1)))
+__global__ void max_num_work_groups_32_1_1_neg_int_arg1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, -1)))
+__global__ void max_num_work_groups_32_1_1_neg_int_arg2() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+__attribute__((amdgpu_max_num_work_groups(0, 1, 1)))
+__global__ void max_num_work_groups_0_1_1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+__attribute__((amdgpu_max_num_work_groups(32, 0, 1)))
+__global__ void max_num_work_groups_32_0_1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, 0)))
+__global__ void max_num_work_groups_32_1_0() {}
+
+__attribute__((amdgpu_max_num_work_groups(4294967295)))
+__global__ void max_num_work_groups_max_unsigned_int() {}
+
+// expected-error@+1{{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
+__attribute__((amdgpu_max_num_work_groups(4294967296)))
+__global__ void max_num_work_groups_max_unsigned_int_plus1() {}
+
+// expected-error@+1{{integer constant expression evaluates to value 10000000000 that cannot be represented in a 32-bit unsigned integer type}}
+__attribute__((amdgpu_max_num_work_groups(10000000000)))
+__global__ void max_num_work_groups_too_large() {}
+
+int num_wg_x = 32;
+int num_wg_y = 1;
+int num_wg_z = 1;
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(num_wg_x, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_non_const_arg0() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(32, num_wg_y, 1)))
+__global__ void max_num_work_groups_32_1_1_non_const_arg1() {}
+
+// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 2 to be an integer constant}}
+__attribute__((amdgpu_max_num_work_groups(32, 1, num_wg_z)))
+__global__ void max_num_work_groups_32_1_1_non_const_arg2() {}
+
+const int c_num_wg_x = 32;
+__attribute__((amdgpu_max_num_work_groups(c_num_wg_x, 1, 1)))
+__global__ void max_num_work_groups_32_1_1_const_arg0() {}
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(a, 1, 1)))
+__global__ void template_a_1_1_max_num_work_groups() {}
+template __global__ void template_a_1_1_max_num_work_groups<32>();
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, a, 1)))
+__global__ void template_32_a_1_max_num_work_groups() {}
+template __global__ void template_32_a_1_max_num_work_groups<1>();
+
+template<unsigned a>
+__attribute__((amdgpu_max_num_work_groups(32, 1, a)))
+__global__ void template_32_1_a_max_num_work_groups() {}
+template __global__ void template_32_1_a_max_num_work_groups<1>();
+
+// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+// expected-note@+4{{in instantiation of}}
+template<unsigned b>
+__attribute__((amdgpu_max_num_work_groups(b, 1, 1)))
+__global__ void template_b_1_1_max_num_work_groups() {}
+template __global__ void template_b_1_1_max_num_work_groups<0>();
+
+// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+// expected-note@+4{{in instantiation of}}
+template<unsigned b>
+__attribute__((amdgpu_max_num_work_groups(32, b, 1)))
+__global__ void template_32_b_1_max_num_work_groups() {}
+template __global__ void template_32_b_1_max_num_work_groups<0>();
+
+// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}}
+// expected-note@+4{{in instantiation of}}
+template<unsigned b>
+__attribute__((amdgpu_max_num_work_groups(32, 1, b)))
+__global__ void template_32_1_b_max_num_work_groups() {}
+template __global__ void template_32_1_b_max_num_work_groups<0>();
+
+
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1442,6 +1442,11 @@ The AMDGPU backend supports the following LLVM IR attributes.
                                             the frame. This is an internal detail of how LDS variables are lowered,
                                             language front ends should not set this attribute.

+     "amdgpu-max-num-workgroups"="x,y,z"     Specify the maximum number of work groups for the kernel dispatch in the
+                                             X, Y, and Z dimensions. Generated by the ``amdgpu_max_num_work_groups``
+                                             CLANG attribute [CLANG-ATTR]_. Clang only emits this attribute when all
+                                             the three numbers are >= 1.
+
     ======================================= ==========================================================

 Calling Conventions
@@ -3917,6 +3922,11 @@ same *vendor-name*.

                                                                  If omitted, "normal" is
                                                                  assumed.
+     ".max_num_work_groups_{x,y,z}"      integer                  The max number of
+                                                                  launched work-groups
+                                                                  in the X, Y, and Z
+                                                                  dimensions. Each number
+                                                                  must be >=1.
     =================================== ============== ========= ================================

 ..
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -494,6 +494,14 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,

  Kern[".max_flat_workgroup_size"] =
      Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
+  unsigned NumWGX = MFI.getMaxNumWorkGroupsX();
+  unsigned NumWGY = MFI.getMaxNumWorkGroupsY();
+  unsigned NumWGZ = MFI.getMaxNumWorkGroupsZ();
+  if (NumWGX != 0 && NumWGY != 0 && NumWGZ != 0) {
+    Kern[".max_num_workgroups_x"] = Kern.getDocument()->getNode(NumWGX);
+    Kern[".max_num_workgroups_y"] = Kern.getDocument()->getNode(NumWGY);
+    Kern[".max_num_workgroups_z"] = Kern.getDocument()->getNode(NumWGZ);
+  }
  Kern[".sgpr_spill_count"] =
      Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
  Kern[".vgpr_spill_count"] =
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -432,7 +432,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
-  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
+  // "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
@@ -1108,3 +1108,8 @@ void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
 unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
 }
+
+SmallVector<unsigned>
+AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
+  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3);
+}
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -288,6 +288,9 @@ public:
  /// 2) dimension.
  unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;

+  /// Return the number of work groups for the function.
+  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
+
  /// Return true if only a single workitem can be active in a wave.
  bool isSingleLaneExecution(const Function &Kernel) const;

--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -46,6 +46,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
  const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);
+  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
+  assert(MaxNumWorkGroups.size() == 3);

  Occupancy = ST.computeOccupancy(F, getLDSSize());
  CallingConv::ID CC = F.getCallingConv();
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -426,6 +426,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,

  const AMDGPUGWSResourcePseudoSourceValue GWSResourcePSV;

+  // Default/requested number of work groups for the function.
+  SmallVector<unsigned> MaxNumWorkGroups = {0, 0, 0};
+
 private:
  unsigned NumUserSGPRs = 0;
  unsigned NumSystemSGPRs = 0;
@@ -1072,6 +1075,13 @@ public:

  // \returns true if a function needs or may need AGPRs.
  bool usesAGPRs(const MachineFunction &MF) const;
+
+  /// \returns Default/requested number of work groups for this function.
+  SmallVector<unsigned> getMaxNumWorkGroups() const { return MaxNumWorkGroups; }
+
+  unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
+  unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
+  unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 };

 } // end namespace llvm
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -11,6 +11,7 @@
 #include "AMDGPUAsmUtils.h"
 #include "AMDKernelCodeT.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
@@ -1298,6 +1299,42 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
  return Ints;
 }

+SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
+                                             unsigned Size) {
+  assert(Size > 2);
+  SmallVector<unsigned> Default(Size, 0);
+
+  Attribute A = F.getFnAttribute(Name);
+  if (!A.isStringAttribute())
+    return Default;
+
+  SmallVector<unsigned> Vals(Size, 0);
+
+  LLVMContext &Ctx = F.getContext();
+
+  StringRef S = A.getValueAsString();
+  unsigned i = 0;
+  for (; !S.empty() && i < Size; i++) {
+    std::pair<StringRef, StringRef> Strs = S.split(',');
+    unsigned IntVal;
+    if (Strs.first.trim().getAsInteger(0, IntVal)) {
+      Ctx.emitError("can't parse integer attribute " + Strs.first + " in " +
+                    Name);
+      return Default;
+    }
+    Vals[i] = IntVal;
+    S = Strs.second;
+  }
+
+  if (!S.empty() || i < Size) {
+    Ctx.emitError("attribute " + Name +
+                  " has incorrect number of integers; expected " +
+                  llvm::utostr(Size));
+    return Default;
+  }
+  return Vals;
+}
+
 unsigned getVmcntBitMask(const IsaVersion &Version) {
  return (1 << (getVmcntBitWidthLo(Version.Major) +
                getVmcntBitWidthHi(Version.Major))) -
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -863,6 +863,14 @@ bool isReadOnlySegment(const GlobalValue *GV);
 /// target triple \p TT, false otherwise.
 bool shouldEmitConstantsToTextSection(const Triple &TT);

+/// \returns Integer value requested using \p F's \p Name attribute.
+///
+/// \returns \p Default if attribute is not present.
+///
+/// \returns \p Default and emits error if requested value cannot be converted
+/// to integer.
+int getIntegerAttribute(const Function &F, StringRef Name, int Default);
+
 /// \returns A pair of integer values requested using \p F's \p Name attribute
 /// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
 /// is false).
@@ -877,6 +885,16 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
                        std::pair<unsigned, unsigned> Default,
                        bool OnlyFirstRequired = false);

+/// \returns Generate a vector of integer values requested using \p F's \p Name
+/// attribute.
+///
+/// \returns true if exactly Size (>2) number of integers are found in the
+/// attribute.
+///
+/// \returns false if any error occurs.
+SmallVector<unsigned> getIntegerVecAttribute(const Function &F, StringRef Name,
+                                             unsigned Size);
+
 /// Represents the counter values to wait for in an s_waitcnt instruction.
 ///
 /// Large values (including the maximum possible integer) can be used to
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups.ll
@@ -0,0 +1,84 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; Attribute not specified.
+; CHECK-LABEL: {{^}}empty_no_attribute:
+define amdgpu_kernel void @empty_no_attribute() {
+entry:
+  ret void
+}
+
+; Ignore if number of work groups for x dimension is 0.
+; CHECK-LABEL: {{^}}empty_max_num_workgroups_x0:
+define amdgpu_kernel void @empty_max_num_workgroups_x0() #0 {
+entry:
+  ret void
+}
+attributes #0 = {"amdgpu-max-num-workgroups"="0,2,3"}
+
+; Ignore if number of work groups for y dimension is 0.
+; CHECK-LABEL: {{^}}empty_max_num_workgroups_y0:
+define amdgpu_kernel void @empty_max_num_workgroups_y0() #1 {
+entry:
+  ret void
+}
+attributes #1 = {"amdgpu-max-num-workgroups"="1,0,3"}
+
+; Ignore if number of work groups for z dimension is 0.
+; CHECK-LABEL: {{^}}empty_max_num_workgroups_z0:
+define amdgpu_kernel void @empty_max_num_workgroups_z0() #2 {
+entry:
+  ret void
+}
+attributes #2 = {"amdgpu-max-num-workgroups"="1,2,0"}
+
+; CHECK-LABEL: {{^}}empty_max_num_workgroups_1_2_3:
+define amdgpu_kernel void @empty_max_num_workgroups_1_2_3() #3 {
+entry:
+  ret void
+}
+attributes #3 = {"amdgpu-max-num-workgroups"="1,2,3"}
+
+; CHECK-LABEL: {{^}}empty_max_num_workgroups_1024_1024_1024:
+define amdgpu_kernel void @empty_max_num_workgroups_1024_1024_1024() #4 {
+entry:
+  ret void
+}
+attributes #4 = {"amdgpu-max-num-workgroups"="1024,1024,1024"}
+
+
+; CHECK: .amdgpu_metadata
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_no_attribute
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_workgroups_x0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_workgroups_y0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .name:           empty_max_num_workgroups_z0
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_workgroups_x: 1
+; CHECK-NEXT:   .max_num_workgroups_y: 2
+; CHECK-NEXT:   .max_num_workgroups_z: 3
+; CHECK-NEXT:   .name:           empty_max_num_workgroups_1_2_3
+; CHECK-NEXT:   .private_segment_fixed_size: 0
+
+; CHECK: - .args:
+; CHECK:        .max_flat_workgroup_size: 1024
+; CHECK-NEXT:   .max_num_workgroups_x: 1024
+; CHECK-NEXT:   .max_num_workgroups_y: 1024
+; CHECK-NEXT:   .max_num_workgroups_z: 1024
+; CHECK-NEXT:   .name:           empty_max_num_workgroups_1024_1024_1024
+; CHECK-NEXT:   .private_segment_fixed_size: 0
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups_error_check.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-workgroups_error_check.ll
@@ -0,0 +1,71 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s 2>&1 | FileCheck --check-prefix=ERROR %s
+
+; ERROR: error: can't parse integer attribute -1 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_neg_num1() #21 {
+entry:
+  ret void
+}
+attributes #21 = {"amdgpu-max-num-workgroups"="-1,2,3"}
+
+; ERROR: error: can't parse integer attribute -2 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_neg_num2() #22 {
+entry:
+  ret void
+}
+attributes #22 = {"amdgpu-max-num-workgroups"="1,-2,3"}
+
+; ERROR: error: can't parse integer attribute -3 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_neg_num3() #23 {
+entry:
+  ret void
+}
+attributes #23 = {"amdgpu-max-num-workgroups"="1,2,-3"}
+
+; ERROR: error: can't parse integer attribute 1.0 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_non_int1() #31 {
+entry:
+  ret void
+}
+attributes #31 = {"amdgpu-max-num-workgroups"="1.0,2,3"}
+
+; ERROR: error: can't parse integer attribute 2.0 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_non_int2() #32 {
+entry:
+  ret void
+}
+attributes #32 = {"amdgpu-max-num-workgroups"="1,2.0,3"}
+
+; ERROR: error: can't parse integer attribute 3.0 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_non_int3() #33 {
+entry:
+  ret void
+}
+attributes #33 = {"amdgpu-max-num-workgroups"="1,2,3.0"}
+
+; ERROR: error: can't parse integer attribute 10000000000 in amdgpu-max-num-workgroups
+define amdgpu_kernel void @empty_max_num_workgroups_too_large() #41 {
+entry:
+  ret void
+}
+attributes #41 = {"amdgpu-max-num-workgroups"="10000000000,2,3"}
+
+; ERROR: error: attribute amdgpu-max-num-workgroups has incorrect number of integers; expected 3
+define amdgpu_kernel void @empty_max_num_workgroups_1_arg() #51 {
+entry:
+  ret void
+}
+attributes #51 = {"amdgpu-max-num-workgroups"="1"}
+
+; ERROR: error: attribute amdgpu-max-num-workgroups has incorrect number of integers; expected 3
+define amdgpu_kernel void @empty_max_num_workgroups_2_args() #52 {
+entry:
+  ret void
+}
+attributes #52 = {"amdgpu-max-num-workgroups"="1,2"}
+
+; ERROR: error: attribute amdgpu-max-num-workgroups has incorrect number of integers; expected 3
+define amdgpu_kernel void @empty_max_num_workgroups_4_args() #53 {
+entry:
+  ret void
+}
+attributes #53 = {"amdgpu-max-num-workgroups"="1,2,3,4"}