Revert "[AMDGPU] Skip register uses in AMDGPUResourceUsageAnalysis (#… (#144039)

…133242)" This reverts commit 130080fab1 because it causes issues in testcases similar to coalescer_remat.ll [1], i.e. when we use a VGPR tuple but only write to its lower parts. The high VGPRs would then not be included in the vgpr_count, and accessing them would be an out of bounds violation. [1] https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
2025-06-13 12:48:24 +02:00
parent 9eef4d1c5f
commit a5cbd2ab0b
48 changed files with 472 additions and 586 deletions
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -4263,9 +4263,10 @@ same *vendor-name*.
                                                                  wavefront for
                                                                  GFX6-GFX9. A register
                                                                  is required if it is
-                                                                  written to, or
+                                                                  used explicitly, or
                                                                  if a higher numbered
-                                                                  register is written to. This
+                                                                  register is used
+                                                                  explicitly. This
                                                                  includes the special
                                                                  SGPRs for VCC, Flat
                                                                  Scratch (GFX7-GFX9)
@@ -4283,10 +4284,10 @@ same *vendor-name*.
                                                                  each work-item for
                                                                  GFX6-GFX9. A register
                                                                  is required if it is
-                                                                  written to, or
+                                                                  used explicitly, or
                                                                  if a higher numbered
-                                                                  register is
-                                                                  written to.
+                                                                  register is used
+                                                                  explicitly.
     ".agpr_count"                       integer        Required  Number of accumulator
                                                                  registers required by
                                                                  each work-item for
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -989,7 +989,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

-  if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
+  if (isShader(F.getCallingConv())) {
    bool IsPixelShader =
        F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

@@ -1060,6 +1060,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,

    ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
        ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
+  } else if (isKernel(F.getCallingConv()) &&
+             MFI->getNumKernargPreloadedSGPRs()) {
+    // Consider cases where the total number of UserSGPRs with trailing
+    // allocated preload SGPRs, is greater than the number of explicitly
+    // referenced SGPRs.
+    const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
+        CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
+    ProgInfo.NumSGPR =
+        AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
  }

  // Adjust number of registers used to meet default/requested minimum/maximum
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -137,29 +137,274 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

-  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC);
+  Info.UsesVCC =
+      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

-  Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
-  Info.NumExplicitSGPR =
-      TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
-  if (ST.hasMAIInsts())
-    Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
-
-  // Preloaded registers are written by the hardware, not defined in the
-  // function body, so they need special handling.
-  if (MFI->isEntryFunction()) {
-    Info.NumExplicitSGPR =
-        std::max<int32_t>(Info.NumExplicitSGPR, MFI->getNumPreloadedSGPRs());
-    Info.NumVGPR = std::max<int32_t>(Info.NumVGPR, MFI->getNumPreloadedVGPRs());
+  // If there are no calls, MachineRegisterInfo can tell us the used register
+  // count easily.
+  // A tail call isn't considered a call for MachineFrameInfo's purposes.
+  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
+    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
+    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
+    if (ST.hasMAIInsts())
+      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
+    return Info;
  }

-  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
-    return Info;
-
+  int32_t MaxVGPR = -1;
+  int32_t MaxAGPR = -1;
+  int32_t MaxSGPR = -1;
  Info.CalleeSegmentSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
+      // TODO: Check regmasks? Do they occur anywhere except calls?
+      for (const MachineOperand &MO : MI.operands()) {
+        unsigned Width = 0;
+        bool IsSGPR = false;
+        bool IsAGPR = false;
+
+        if (!MO.isReg())
+          continue;
+
+        Register Reg = MO.getReg();
+        switch (Reg) {
+        case AMDGPU::EXEC:
+        case AMDGPU::EXEC_LO:
+        case AMDGPU::EXEC_HI:
+        case AMDGPU::SCC:
+        case AMDGPU::M0:
+        case AMDGPU::M0_LO16:
+        case AMDGPU::M0_HI16:
+        case AMDGPU::SRC_SHARED_BASE_LO:
+        case AMDGPU::SRC_SHARED_BASE:
+        case AMDGPU::SRC_SHARED_LIMIT_LO:
+        case AMDGPU::SRC_SHARED_LIMIT:
+        case AMDGPU::SRC_PRIVATE_BASE_LO:
+        case AMDGPU::SRC_PRIVATE_BASE:
+        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
+        case AMDGPU::SRC_PRIVATE_LIMIT:
+        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+        case AMDGPU::SGPR_NULL:
+        case AMDGPU::SGPR_NULL64:
+        case AMDGPU::MODE:
+          continue;
+
+        case AMDGPU::NoRegister:
+          assert(MI.isDebugInstr() &&
+                 "Instruction uses invalid noreg register");
+          continue;
+
+        case AMDGPU::VCC:
+        case AMDGPU::VCC_LO:
+        case AMDGPU::VCC_HI:
+        case AMDGPU::VCC_LO_LO16:
+        case AMDGPU::VCC_LO_HI16:
+        case AMDGPU::VCC_HI_LO16:
+        case AMDGPU::VCC_HI_HI16:
+          Info.UsesVCC = true;
+          continue;
+
+        case AMDGPU::FLAT_SCR:
+        case AMDGPU::FLAT_SCR_LO:
+        case AMDGPU::FLAT_SCR_HI:
+          continue;
+
+        case AMDGPU::XNACK_MASK:
+        case AMDGPU::XNACK_MASK_LO:
+        case AMDGPU::XNACK_MASK_HI:
+          llvm_unreachable("xnack_mask registers should not be used");
+
+        case AMDGPU::LDS_DIRECT:
+          llvm_unreachable("lds_direct register should not be used");
+
+        case AMDGPU::TBA:
+        case AMDGPU::TBA_LO:
+        case AMDGPU::TBA_HI:
+        case AMDGPU::TMA:
+        case AMDGPU::TMA_LO:
+        case AMDGPU::TMA_HI:
+          llvm_unreachable("trap handler registers should not be used");
+
+        case AMDGPU::SRC_VCCZ:
+          llvm_unreachable("src_vccz register should not be used");
+
+        case AMDGPU::SRC_EXECZ:
+          llvm_unreachable("src_execz register should not be used");
+
+        case AMDGPU::SRC_SCC:
+          llvm_unreachable("src_scc register should not be used");
+
+        default:
+          break;
+        }
+
+        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
+            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
+            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 1;
+        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 1;
+        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 1;
+        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 2;
+        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 2;
+        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 2;
+        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 3;
+        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 3;
+        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 3;
+        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 4;
+        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 4;
+        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 4;
+        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 5;
+        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 5;
+        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 5;
+        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 6;
+        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 6;
+        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 6;
+        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 7;
+        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 7;
+        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 7;
+        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 8;
+        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 8;
+        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 8;
+        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 9;
+        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 9;
+        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 9;
+        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 10;
+        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 10;
+        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 10;
+        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 11;
+        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 11;
+        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 11;
+        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 12;
+        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 12;
+        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 12;
+        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 16;
+        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 16;
+        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 16;
+        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 32;
+        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 32;
+        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 32;
+        } else {
+          // We only expect TTMP registers or registers that do not belong to
+          // any RC.
+          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
+                  !TRI.getPhysRegBaseClass(Reg)) &&
+                 "Unknown register class");
+        }
+        unsigned HWReg = TRI.getHWRegIndex(Reg);
+        int MaxUsed = HWReg + Width - 1;
+        if (IsSGPR) {
+          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+        } else if (IsAGPR) {
+          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
+        } else {
+          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+        }
+      }
+
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?
@@ -219,5 +464,9 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    }
  }

+  Info.NumExplicitSGPR = MaxSGPR + 1;
+  Info.NumVGPR = MaxVGPR + 1;
+  Info.NumAGPR = MaxAGPR + 1;
+
  return Info;
 }
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -970,25 +970,10 @@ public:
    return NumUserSGPRs;
  }

-  // Get the number of preloaded SGPRs for compute kernels.
  unsigned getNumPreloadedSGPRs() const {
    return NumUserSGPRs + NumSystemSGPRs;
  }

-  // Get the number of preloaded VGPRs for compute kernels.
-  unsigned getNumPreloadedVGPRs() const {
-    if (hasWorkItemIDZ())
-      return ArgInfo.WorkItemIDZ.getRegister() - AMDGPU::VGPR0 + 1;
-
-    if (hasWorkItemIDY())
-      return ArgInfo.WorkItemIDY.getRegister() - AMDGPU::VGPR0 + 1;
-
-    if (hasWorkItemIDX())
-      return ArgInfo.WorkItemIDX.getRegister() - AMDGPU::VGPR0 + 1;
-
-    return 0;
-  }
-
  unsigned getNumKernargPreloadedSGPRs() const {
    return UserSGPRInfo.getNumKernargPreloadSGPRs();
  }
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4055,20 +4055,6 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
  return 0;
 }

-unsigned
-SIRegisterInfo::getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
-                                      const TargetRegisterClass &RC) const {
-  for (MCPhysReg Reg : reverse(RC.getRegisters())) {
-    for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) {
-      if (llvm::any_of(MRI.def_instructions(*AI), [](const MachineInstr &MI) {
-            return !MI.isImplicitDef();
-          }))
-        return getHWRegIndex(Reg) + 1;
-    }
-  }
-  return 0;
-}
-
 SmallVector<StringLiteral>
 SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
                                  const MachineFunction &MF) const {
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -486,11 +486,6 @@ public:
  unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
                              const TargetRegisterClass &RC) const;

-  // \returns the number of registers of a given \p RC defined in a function.
-  // Does not go inside function calls.
-  unsigned getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
-                                 const TargetRegisterClass &RC) const;
-
  std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
    return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
                             : std::optional<uint8_t>{};
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 24
+; GPRIDX-NEXT:     wavefront_sgpr_count = 17
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -3245,7 +3245,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 18
+; GFX10-NEXT:     wavefront_sgpr_count = 10
 ; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
@@ -3294,7 +3294,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -3337,7 +3337,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 16
+; GFX11-NEXT:     wavefront_sgpr_count = 7
 ; GFX11-NEXT:     workitem_vgpr_count = 3
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
@@ -4034,7 +4034,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4077,8 +4077,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 24
-; GPRIDX-NEXT:     workitem_vgpr_count = 3
+; GPRIDX-NEXT:     wavefront_sgpr_count = 16
+; GPRIDX-NEXT:     workitem_vgpr_count = 2
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4249,8 +4249,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 18
-; GFX10-NEXT:     workitem_vgpr_count = 3
+; GFX10-NEXT:     wavefront_sgpr_count = 10
+; GFX10-NEXT:     workitem_vgpr_count = 2
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
 ; GFX10-NEXT:     reserved_sgpr_first = 0
@@ -4291,7 +4291,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4334,7 +4334,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 16
+; GFX11-NEXT:     wavefront_sgpr_count = 6
 ; GFX11-NEXT:     workitem_vgpr_count = 2
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
@@ -4382,7 +4382,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4425,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 24
+; GPRIDX-NEXT:     wavefront_sgpr_count = 16
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4603,7 +4603,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 18
+; GFX10-NEXT:     wavefront_sgpr_count = 10
 ; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
@@ -4648,7 +4648,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4691,7 +4691,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 16
+; GFX11-NEXT:     wavefront_sgpr_count = 7
 ; GFX11-NEXT:     workitem_vgpr_count = 3
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
@@ -13,9 +13,8 @@
 ; CHECK: {{^}}kernel_illegal_agpr_use_asm:
 ; CHECK: ; use a0

-; GFX908: NumVgprs: 3
-; GFX90A: NumVgprs: 1
-; CHECK: NumAgprs: 0
+; CHECK: NumVgprs: 0
+; CHECK: NumAgprs: 1
 define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
  call void asm sideeffect "; use $0", "a"(i32 poison)
  ret void
@@ -25,7 +24,7 @@ define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
 ; CHECK: ; use a0

 ; CHECK: NumVgprs: 0
-; CHECK: NumAgprs: 0
+; CHECK: NumAgprs: 1
 define void @func_illegal_agpr_use_asm() #0 {
  call void asm sideeffect "; use $0", "a"(i32 poison)
  ret void
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -10,9 +10,9 @@

 ; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
 ; ASM: .amdhsa_user_sgpr_count 12
-; ASM: .amdhsa_next_free_sgpr 15
-; ASM: ; TotalNumSgprs: 21
-; ASM: ; NumSGPRsForWavesPerEU: 21
+; ASM: .amdhsa_next_free_sgpr 12
+; ASM: ; TotalNumSgprs: 18
+; ASM: ; NumSGPRsForWavesPerEU: 18

 ; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
 ; feild that are not explicitly referenced in the kernel. This test has 6 implicit
@@ -26,13 +26,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret
 ; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
 ; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000  ................
 ; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 0070 8000af00 94000000 08000800 00000000  ................
+; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000  @...............

 ; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 10
-; ASM: .amdhsa_next_free_sgpr 11
-; ASM: ; TotalNumSgprs: 17
-; ASM: ; NumSGPRsForWavesPerEU: 17
+; ASM: .amdhsa_next_free_sgpr 10
+; ASM: ; TotalNumSgprs: 16
+; ASM: ; NumSGPRsForWavesPerEU: 16

 ; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
 ; implicit, and 6 extra.
@@ -46,9 +46,9 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 {

 ; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 3
-; ASM: .amdhsa_next_free_sgpr 4
-; ASM: ; TotalNumSgprs: 10
-; ASM: ; NumSGPRsForWavesPerEU: 10
+; ASM: .amdhsa_next_free_sgpr 3
+; ASM: ; TotalNumSgprs: 9
+; ASM: ; NumSGPRsForWavesPerEU: 9

 ; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.

@@ -57,13 +57,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r
 ; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
 ; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
 ; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00f0 4000af00 84000000 08000000 00000000  @...............
+; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................

 ; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 2
-; ASM: .amdhsa_next_free_sgpr 3
-; ASM: ; TotalNumSgprs: 9
-; ASM: ; NumSGPRsForWavesPerEU: 9
+; ASM: .amdhsa_next_free_sgpr 0
+; ASM: ; TotalNumSgprs: 6
+; ASM: ; NumSGPRsForWavesPerEU: 6

 ; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
 ; Encoded like '00'.
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -142,8 +142,8 @@ attributes #0 = { nounwind }

 ; GCN: amdpal.pipelines:
 ; GCN-NEXT:  - .registers:
-; GFX8-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf010a{{$}}
-; GFX9-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf014a{{$}}
+; SDAG-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; GISEL-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
 ; GCN-NEXT:      '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
@@ -164,13 +164,13 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      multiple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x24{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x21{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x24{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x3{{$}}
 ; GCN-NEXT:      no_stack:
 ; GCN-NEXT:        .backend_stack_size: 0{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x20{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      no_stack_call:
@@ -203,7 +203,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      simple_lds:
 ; GCN-NEXT:        .backend_stack_size: 0{{$}}
 ; GCN-NEXT:        .lds_size:       0x100{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x20{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      simple_lds_recurse:
@@ -215,7 +215,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      simple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x14{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x21{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x14{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2{{$}}
 ; GCN-NEXT:      simple_stack_call:
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX11W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX11W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s

 ; ELF: Section {
 ; ELF: Name: .text
@@ -23,16 +23,8 @@
 ; ELF: Section: .text (0x2)
 ; ELF: }

-; GFX10: NumSGPRsForWavesPerEU: 12
-; GFX10: NumVGPRsForWavesPerEU: 3
-
-; Wave32 and 64 behave differently due to the UserSGPRInit16Bug,
-; which only affects Wave32.
-; GFX11W32: NumSGPRsForWavesPerEU: 16
-; GFX11W32: NumVGPRsForWavesPerEU: 1
-
-; GFX11W64: NumSGPRsForWavesPerEU: 11
-; GFX11W64: NumVGPRsForWavesPerEU: 1
+; GFX10: NumSGPRsForWavesPerEU: 6
+; GFX10: NumVGPRsForWavesPerEU: 1

 define amdgpu_kernel void @simple(ptr addrspace(1) %out) {
 entry:
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -2,10 +2,10 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=HSAMD %s

 ; CHECK-LABEL: {{^}}min_64_max_64:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @min_64_max_64() #0 {
 entry:
  ret void
@@ -13,10 +13,10 @@ entry:
 attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}

 ; CHECK-LABEL: {{^}}min_64_max_128:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @min_64_max_128() #1 {
 entry:
  ret void
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -26,10 +26,10 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"}

 ; Exactly 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_exactly_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_exactly_10() #2 {
 entry:
  ret void
@@ -38,10 +38,10 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"}

 ; At least 1 wave per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_1:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_least_1() #3 {
 entry:
  ret void
@@ -50,10 +50,10 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"}

 ; At least 5 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_5:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_least_5() #4 {
 entry:
  ret void
@@ -62,10 +62,10 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"}

 ; At least 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_least_10() #5 {
 entry:
  ret void
@@ -88,10 +88,10 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64

 ; At most 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_most_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_most_10() #7 {
 entry:
  ret void
@@ -102,10 +102,10 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"}

 ; Between 5 and 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_between_5_and_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_between_5_and_10() #8 {
 entry:
  ret void
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -28,7 +28,7 @@ bb:
 }
 ; ALL:      .set .Laliasee_default.num_vgpr, 0
 ; ALL-NEXT: .set .Laliasee_default.num_agpr, 27
-; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 0
+; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 32

 attributes #0 = { noinline norecurse nounwind optnone }
 attributes #1 = { noinline norecurse nounwind readnone willreturn }
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -18,7 +18,7 @@ bb:

 ; CHECK:      .set .Laliasee_default_vgpr64_sgpr102.num_vgpr, 53
 ; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 32
 define internal void @aliasee_default_vgpr64_sgpr102() #1 {
 bb:
  call void asm sideeffect "; clobber v52 ", "~{v52}"()
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -24,7 +24,7 @@ bb:

 ; CHECK:      .set .Laliasee_vgpr32_sgpr76.num_vgpr, 27
 ; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 32
 define internal void @aliasee_vgpr32_sgpr76() #1 {
 bb:
  call void asm sideeffect "; clobber v26 ", "~{v26}"()
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -21,7 +21,7 @@ bb:

 ; CHECK:      .set .Laliasee_vgpr64_sgpr102.num_vgpr, 53
 ; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 32
 define internal void @aliasee_vgpr64_sgpr102() #1 {
 bb:
  call void asm sideeffect "; clobber v52 ", "~{v52}"()
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -21,7 +21,7 @@ bb:

 ; CHECK:      .set .Laliasee_vgpr256_sgpr102.num_vgpr, 253
 ; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 33
 define internal void @aliasee_vgpr256_sgpr102() #1 {
 bb:
  call void asm sideeffect "; clobber v252 ", "~{v252}"()
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -7,7 +7,7 @@
 ; Make sure to run a GPU with the SGPR allocation bug.

 ; GCN-LABEL: {{^}}use_vcc:
-; GCN: ; TotalNumSgprs: 2
+; GCN: ; TotalNumSgprs: 34
 ; GCN: ; NumVgprs: 0
 define void @use_vcc() #1 {
  call void asm sideeffect "", "~{vcc}" () #0
@@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
 }

 ; GCN-LABEL: {{^}}use_flat_scratch:
-; CI: ; TotalNumSgprs: 4
-; VI: ; TotalNumSgprs: 6
+; CI: ; TotalNumSgprs: 36
+; VI: ; TotalNumSgprs: 38
 ; GCN: ; NumVgprs: 0
 define void @use_flat_scratch() #1 {
  call void asm sideeffect "", "~{flat_scratch}" () #0
@@ -234,7 +234,7 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
 ; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
 ; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4
 ; VI-BUG: TotalNumSgprs: 96
@@ -249,7 +249,7 @@ entry:
 ; Make sure there's no assert when a sgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr160_external_call
 ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
 ; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
 ; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4
 ; VI-BUG: TotalNumSgprs: 96
--- a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; It's probably OK if this is slightly higher:
-; CHECK: ; NumVgprs: 5
+; CHECK: ; NumVgprs: 8
 define amdgpu_kernel void @foobar(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %flag) {
 entry:
  %cmpflag = icmp eq i32 %flag, 1
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -16,7 +16,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -35,7 +35,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -93,7 +93,7 @@ entry:
 ; registers used.
 ;
 ; ALL-ASM-LABEL: {{^}}empty:
-; ALL-ASM:     .amdhsa_next_free_vgpr 3
+; ALL-ASM:     .amdhsa_next_free_vgpr 1
 ; ALL-ASM:     .amdhsa_next_free_sgpr 1
 define amdgpu_kernel void @empty(
    i32 %i,
--- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
@@ -43,7 +43,7 @@
 ; OSABI-HSA-ELF:     .sgpr_count:     96
 ; OSABI-HSA-ELF:     .sgpr_spill_count: 0
 ; OSABI-HSA-ELF:     .symbol:         elf_notes.kd
-; OSABI-HSA-ELF:     .vgpr_count:     1
+; OSABI-HSA-ELF:     .vgpr_count:     0
 ; OSABI-HSA-ELF:     .vgpr_spill_count: 0
 ; OSABI-HSA-ELF:     .wavefront_size: 64
 ; OSABI-HSA-ELF: amdhsa.target:   amdgcn-amd-amdhsa--gfx802
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -27,15 +27,15 @@
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1

-; CI: ; TotalNumSgprs: 12
-; VI-NOXNACK: ; TotalNumSgprs: 12
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 18
-; VI-XNACK: ; TotalNumSgprs: 16
-; HSA-VI-XNACK: ; TotalNumSgprs: 22
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; CI: ; TotalNumSgprs: 8
+; VI-NOXNACK: ; TotalNumSgprs: 8
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 8
+; VI-XNACK: ; TotalNumSgprs: 12
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
 define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
  call void asm sideeffect "", "~{s7}"()
@@ -50,15 +50,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1

-; CI: ; TotalNumSgprs: 14
-; VI-NOXNACK: ; TotalNumSgprs: 14
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 20
-; VI-XNACK: ; TotalNumSgprs: 16
-; HSA-VI-XNACK: ; TotalNumSgprs: 22
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
+; CI: ; TotalNumSgprs: 10
+; VI-NOXNACK: ; TotalNumSgprs: 10
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 10
+; VI-XNACK: ; TotalNumSgprs: 12
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
 define amdgpu_kernel void @vcc_no_flat() {
 entry:
  call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -73,15 +73,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1

-; CI: ; TotalNumSgprs: 16
-; VI-NOXNACK: ; TotalNumSgprs: 18
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 18
+; VI-XNACK: ; TotalNumSgprs: 14
 ; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
 define amdgpu_kernel void @no_vcc_flat() {
 entry:
  call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -96,15 +96,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1

-; CI: ; TotalNumSgprs: 16
-; VI-NOXNACK: ; TotalNumSgprs: 18
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 18
+; VI-XNACK: ; TotalNumSgprs: 14
 ; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
 define amdgpu_kernel void @vcc_flat() {
 entry:
  call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -122,15 +122,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1

-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr() #0 {
 entry:
  call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -143,15 +143,15 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1

-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_lo() #0 {
 entry:
  call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -166,15 +166,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1

-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_hi() #0 {
 entry:
  call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -5,14 +5,14 @@
 ; GCN-LABEL: {{^}}use_vcc:
 ; GCN: .set use_vcc.num_vgpr, 0
 ; GCN: .set use_vcc.num_agpr, 0
-; GCN: .set use_vcc.numbered_sgpr, 0
+; GCN: .set use_vcc.numbered_sgpr, 32
 ; GCN: .set use_vcc.private_seg_size, 0
 ; GCN: .set use_vcc.uses_vcc, 1
 ; GCN: .set use_vcc.uses_flat_scratch, 0
 ; GCN: .set use_vcc.has_dyn_sized_stack, 0
 ; GCN: .set use_vcc.has_recursion, 0
 ; GCN: .set use_vcc.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
 ; GCN: NumVgprs: 0
 ; GCN: ScratchSize: 0
 define void @use_vcc() #1 {
@@ -59,14 +59,14 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
 ; GCN-LABEL: {{^}}use_flat_scratch:
 ; GCN: .set use_flat_scratch.num_vgpr, 0
 ; GCN: .set use_flat_scratch.num_agpr, 0
-; GCN: .set use_flat_scratch.numbered_sgpr, 0
+; GCN: .set use_flat_scratch.numbered_sgpr, 32
 ; GCN: .set use_flat_scratch.private_seg_size, 0
 ; GCN: .set use_flat_scratch.uses_vcc, 0
 ; GCN: .set use_flat_scratch.uses_flat_scratch, 1
 ; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
 ; GCN: .set use_flat_scratch.has_recursion, 0
 ; GCN: .set use_flat_scratch.has_indirect_call, 0
-; GCN: TotalNumSgprs: 6
+; GCN: TotalNumSgprs: 38
 ; GCN: NumVgprs: 0
 ; GCN: ScratchSize: 0
 define void @use_flat_scratch() #1 {
@@ -113,14 +113,14 @@ define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace
 ; GCN-LABEL: {{^}}use_10_vgpr:
 ; GCN: .set use_10_vgpr.num_vgpr, 10
 ; GCN: .set use_10_vgpr.num_agpr, 0
-; GCN: .set use_10_vgpr.numbered_sgpr, 0
+; GCN: .set use_10_vgpr.numbered_sgpr, 32
 ; GCN: .set use_10_vgpr.private_seg_size, 0
 ; GCN: .set use_10_vgpr.uses_vcc, 0
 ; GCN: .set use_10_vgpr.uses_flat_scratch, 0
 ; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
 ; GCN: .set use_10_vgpr.has_recursion, 0
 ; GCN: .set use_10_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
 ; GCN: NumVgprs: 10
 ; GCN: ScratchSize: 0
 define void @use_10_vgpr() #1 {
@@ -168,14 +168,14 @@ define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
 ; GCN-LABEL: {{^}}use_50_vgpr:
 ; GCN:	.set use_50_vgpr.num_vgpr, 50
 ; GCN:	.set use_50_vgpr.num_agpr, 0
-; GCN:	.set use_50_vgpr.numbered_sgpr, 0
+; GCN:	.set use_50_vgpr.numbered_sgpr, 32
 ; GCN:	.set use_50_vgpr.private_seg_size, 0
 ; GCN:	.set use_50_vgpr.uses_vcc, 0
 ; GCN:	.set use_50_vgpr.uses_flat_scratch, 0
 ; GCN:	.set use_50_vgpr.has_dyn_sized_stack, 0
 ; GCN:	.set use_50_vgpr.has_recursion, 0
 ; GCN:	.set use_50_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
 ; GCN: NumVgprs: 50
 ; GCN: ScratchSize: 0
 define void @use_50_vgpr() #1 {
@@ -258,14 +258,14 @@ define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
 ; GCN-LABEL: {{^}}use_stack0:
 ; GCN:	.set use_stack0.num_vgpr, 1
 ; GCN:	.set use_stack0.num_agpr, 0
-; GCN:	.set use_stack0.numbered_sgpr, 0
+; GCN:	.set use_stack0.numbered_sgpr, 33
 ; GCN:	.set use_stack0.private_seg_size, 2052
 ; GCN:	.set use_stack0.uses_vcc, 0
 ; GCN:	.set use_stack0.uses_flat_scratch, 0
 ; GCN:	.set use_stack0.has_dyn_sized_stack, 0
 ; GCN:	.set use_stack0.has_recursion, 0
 ; GCN:	.set use_stack0.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 37
 ; GCN: NumVgprs: 1
 ; GCN: ScratchSize: 2052
 define void @use_stack0() #1 {
@@ -277,14 +277,14 @@ define void @use_stack0() #1 {
 ; GCN-LABEL: {{^}}use_stack1:
 ; GCN:	.set use_stack1.num_vgpr, 1
 ; GCN:	.set use_stack1.num_agpr, 0
-; GCN:	.set use_stack1.numbered_sgpr, 0
+; GCN:	.set use_stack1.numbered_sgpr, 33
 ; GCN:	.set use_stack1.private_seg_size, 404
 ; GCN:	.set use_stack1.uses_vcc, 0
 ; GCN:	.set use_stack1.uses_flat_scratch, 0
 ; GCN:	.set use_stack1.has_dyn_sized_stack, 0
 ; GCN:	.set use_stack1.has_recursion, 0
 ; GCN:	.set use_stack1.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 37
 ; GCN: NumVgprs: 1
 ; GCN: ScratchSize: 404
 define void @use_stack1() #1 {
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,7 +15,7 @@
 ; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
-; CHECK:     .sgpr_count:     16
+; CHECK:     .sgpr_count:     10
 ; CHECK:     .symbol:         test.kd
 ; CHECK:     .vgpr_count:     {{3|6}}
 ; WAVE64:    .wavefront_size: 64
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -63,7 +63,7 @@
 ; ELF:   0220: 70725F73 70696C6C 5F636F75 6E7400A7
 ; ELF:   0230: 2E73796D 626F6CB5 73696D70 6C655F6E
 ; ELF:   0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
-; ELF:   0250: 6770725F 636F756E 7401B12E 76677072
+; ELF:   0250: 6770725F 636F756E 7402B12E 76677072
 ; ELF:   0260: 5F737069 6C6C5F63 6F756E74 00AF2E77
 ; ELF:   0270: 61766566 726F6E74 5F73697A 6540AD61
 ; ELF:   0280: 6D646873 612E7461 72676574 BD616D64
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
@@ -1,72 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Use VGPRs above the input arguments.
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0x1d{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
-                                    i32 %vcr, { i32 } %system.data,
-                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
-                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
-                                    local_unnamed_addr {
-entry:
-  %system.data.value = extractvalue { i32 } %system.data, 0
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
-  %system.data.extract = extractvalue { i32 } %system.data, 0
-  %data.mul = mul i32 %system.data.extract, 2
-  %data.add = add i32 %data.mul, 1
-  call void asm sideeffect "; clobber v28", "~{v28}"()
-  br label %tail
-
-tail:
-  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
-  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
-  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
-  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
-  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
-  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
-  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
-  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
-  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
-  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
-  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
-  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
-  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
-  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
-  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
-  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
-  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
-  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
-  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
-  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
-  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
-  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
-  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
-        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
-        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
-        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
-        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
-  unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
@@ -1,46 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; CHECK-LABEL: leaf_shader:
-; CHECK: .vgpr_count:{{.*}}0x1{{$}}
-
-; Function without calls.
-define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
-                              i32 %active.vgpr1, i32 %active.vgpr2,
-                              i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                              i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
-                              local_unnamed_addr {
-entry:
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %compute, label %merge
-
-compute:
-  ; Perform a more complex computation using active VGPRs
-  %square = mul i32 %active.vgpr1, %active.vgpr1
-  %product = mul i32 %square, %active.vgpr2
-  %sum = add i32 %product, %input.value
-  %result = add i32 %sum, 42
-  br label %merge
-
-merge:
-  %final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]
-
-  store i32 %final.result, ptr %output.ptr, align 4
-
-  ret void
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
@@ -1,74 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes.
-; In that case, the VGPR should be included in the .vgpr_count
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0xd{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
-                                    i32 %vcr, { i32 } %system.data,
-                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
-                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
-                                    local_unnamed_addr {
-entry:
-  %system.data.value = extractvalue { i32 } %system.data, 0
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
-  %system.data.extract = extractvalue { i32 } %system.data, 0
-  %data.mul = mul i32 %system.data.extract, 2
-  %data.add = add i32 %data.mul, 1
-  call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"()
-  br label %tail
-
-tail:
-  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
-  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
-  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
-  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
-  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
-  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
-  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
-  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
-  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
-  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
-  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
-  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
-  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
-  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
-  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
-  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
-  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
-  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
-  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
-  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
-  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
-  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
-  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
-        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
-        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
-        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
-        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
-  unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
@@ -1,71 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0xa{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
-                                    i32 %vcr, { i32 } %system.data,
-                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
-                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
-                                    local_unnamed_addr {
-entry:
-  %system.data.value = extractvalue { i32 } %system.data, 0
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
-  %system.data.extract = extractvalue { i32 } %system.data, 0
-  %data.mul = mul i32 %system.data.extract, 2
-  %data.add = add i32 %data.mul, 1
-  br label %tail
-
-tail:
-  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
-  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
-  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
-  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
-  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
-  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
-  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
-  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
-  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
-  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
-  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
-  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
-  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
-  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
-  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
-  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
-  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
-  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
-  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
-  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
-  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
-  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
-  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
-        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
-        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
-        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
-        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
-  unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -64,7 +64,7 @@ define void @func_regular_call() #1 {
 ; GCN-NEXT: s_addc_u32 s17,
 ; GCN-NEXT: s_setpc_b64 s[16:17]

-; GCN: ; TotalNumSgprs: 18
+; GCN: ; TotalNumSgprs: 32
 ; GCN: ; NumVgprs: 8
 define void @func_tail_call() #1 {
  tail call void @func()
--- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
+++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
@@ -24,7 +24,7 @@ define void @I_Quit() {
 ; CHECK-LABEL: P_RemoveMobj:
 ; CHECK: .set P_RemoveMobj.num_vgpr, 0
 ; CHECK: .set P_RemoveMobj.num_agpr, 0
-; CHECK: .set P_RemoveMobj.numbered_sgpr, 0
+; CHECK: .set P_RemoveMobj.numbered_sgpr, 32
 ; CHECK: .set P_RemoveMobj.private_seg_size, 0
 ; CHECK: .set P_RemoveMobj.uses_vcc, 0
 ; CHECK: .set P_RemoveMobj.uses_flat_scratch, 0
@@ -38,7 +38,7 @@ define void @P_RemoveMobj() {
 ; CHECK-LABEL: P_SpawnMobj:
 ; CHECK: .set P_SpawnMobj.num_vgpr, 0
 ; CHECK: .set P_SpawnMobj.num_agpr, 0
-; CHECK: .set P_SpawnMobj.numbered_sgpr, 0
+; CHECK: .set P_SpawnMobj.numbered_sgpr, 32
 ; CHECK: .set P_SpawnMobj.private_seg_size, 0
 ; CHECK: .set P_SpawnMobj.uses_vcc, 0
 ; CHECK: .set P_SpawnMobj.uses_flat_scratch, 0
@@ -52,7 +52,7 @@ define void @P_SpawnMobj() {
 ; CHECK-LABEL: G_PlayerReborn:
 ; CHECK: .set G_PlayerReborn.num_vgpr, 0
 ; CHECK: .set G_PlayerReborn.num_agpr, 0
-; CHECK: .set G_PlayerReborn.numbered_sgpr, 0
+; CHECK: .set G_PlayerReborn.numbered_sgpr, 32
 ; CHECK: .set G_PlayerReborn.private_seg_size, 0
 ; CHECK: .set G_PlayerReborn.uses_vcc, 0
 ; CHECK: .set G_PlayerReborn.uses_flat_scratch, 0
@@ -66,7 +66,7 @@ define void @G_PlayerReborn() {
 ; CHECK-LABEL: P_SetThingPosition:
 ; CHECK: .set P_SetThingPosition.num_vgpr, 0
 ; CHECK: .set P_SetThingPosition.num_agpr, 0
-; CHECK: .set P_SetThingPosition.numbered_sgpr, 0
+; CHECK: .set P_SetThingPosition.numbered_sgpr, 32
 ; CHECK: .set P_SetThingPosition.private_seg_size, 0
 ; CHECK: .set P_SetThingPosition.uses_vcc, 0
 ; CHECK: .set P_SetThingPosition.uses_flat_scratch, 0
@@ -96,7 +96,7 @@ define void @P_SetupPsprites(ptr addrspace(1) %i) {
 ; CHECK-LABEL: HU_Start:
 ; CHECK: .set HU_Start.num_vgpr, 0
 ; CHECK: .set HU_Start.num_agpr, 0
-; CHECK: .set HU_Start.numbered_sgpr, 0
+; CHECK: .set HU_Start.numbered_sgpr, 32
 ; CHECK: .set HU_Start.private_seg_size, 0
 ; CHECK: .set HU_Start.uses_vcc, 0
 ; CHECK: .set HU_Start.uses_flat_scratch, 0
@@ -162,7 +162,7 @@ define void @G_DoReborn() {
 ; CHECK-LABEL: AM_Stop:
 ; CHECK: .set AM_Stop.num_vgpr, 0
 ; CHECK: .set AM_Stop.num_agpr, 0
-; CHECK: .set AM_Stop.numbered_sgpr, 0
+; CHECK: .set AM_Stop.numbered_sgpr, 32
 ; CHECK: .set AM_Stop.private_seg_size, 0
 ; CHECK: .set AM_Stop.uses_vcc, 0
 ; CHECK: .set AM_Stop.uses_flat_scratch, 0
@@ -176,7 +176,7 @@ define void @AM_Stop() {
 ; CHECK-LABEL: D_AdvanceDemo:
 ; CHECK: .set D_AdvanceDemo.num_vgpr, 0
 ; CHECK: .set D_AdvanceDemo.num_agpr, 0
-; CHECK: .set D_AdvanceDemo.numbered_sgpr, 0
+; CHECK: .set D_AdvanceDemo.numbered_sgpr, 32
 ; CHECK: .set D_AdvanceDemo.private_seg_size, 0
 ; CHECK: .set D_AdvanceDemo.uses_vcc, 0
 ; CHECK: .set D_AdvanceDemo.uses_flat_scratch, 0
@@ -190,7 +190,7 @@ define void @D_AdvanceDemo() {
 ; CHECK-LABEL: F_StartFinale:
 ; CHECK: .set F_StartFinale.num_vgpr, 0
 ; CHECK: .set F_StartFinale.num_agpr, 0
-; CHECK: .set F_StartFinale.numbered_sgpr, 0
+; CHECK: .set F_StartFinale.numbered_sgpr, 32
 ; CHECK: .set F_StartFinale.private_seg_size, 0
 ; CHECK: .set F_StartFinale.uses_vcc, 0
 ; CHECK: .set F_StartFinale.uses_flat_scratch, 0
@@ -204,7 +204,7 @@ define void @F_StartFinale() {
 ; CHECK-LABEL: F_Ticker:
 ; CHECK: .set F_Ticker.num_vgpr, 0
 ; CHECK: .set F_Ticker.num_agpr, 0
-; CHECK: .set F_Ticker.numbered_sgpr, 0
+; CHECK: .set F_Ticker.numbered_sgpr, 32
 ; CHECK: .set F_Ticker.private_seg_size, 0
 ; CHECK: .set F_Ticker.uses_vcc, 0
 ; CHECK: .set F_Ticker.uses_flat_scratch, 0
@@ -236,7 +236,7 @@ define i32 @G_CheckDemoStatus() {
 ; CHECK-LABEL: P_TempSaveGameFile:
 ; CHECK: .set P_TempSaveGameFile.num_vgpr, 2
 ; CHECK: .set P_TempSaveGameFile.num_agpr, 0
-; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 0
+; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 32
 ; CHECK: .set P_TempSaveGameFile.private_seg_size, 0
 ; CHECK: .set P_TempSaveGameFile.uses_vcc, 0
 ; CHECK: .set P_TempSaveGameFile.uses_flat_scratch, 0
@@ -250,7 +250,7 @@ define ptr @P_TempSaveGameFile() {
 ; CHECK-LABEL: P_SaveGameFile:
 ; CHECK: .set P_SaveGameFile.num_vgpr, 2
 ; CHECK: .set P_SaveGameFile.num_agpr, 0
-; CHECK: .set P_SaveGameFile.numbered_sgpr, 0
+; CHECK: .set P_SaveGameFile.numbered_sgpr, 32
 ; CHECK: .set P_SaveGameFile.private_seg_size, 0
 ; CHECK: .set P_SaveGameFile.uses_vcc, 0
 ; CHECK: .set P_SaveGameFile.uses_flat_scratch, 0
@@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() {
 ; CHECK-LABEL: R_FlatNumForName:
 ; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr)
 ; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr)
-; CHECK: .set R_FlatNumForName.numbered_sgpr, max(34, I_Error.numbered_sgpr)
+; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr)
 ; CHECK: .set R_FlatNumForName.private_seg_size, 16+max(I_Error.private_seg_size)
 ; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc)
 ; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch)
--- a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: {{^}}qux
 ; CHECK: .set qux.num_vgpr, 13
 ; CHECK: .set qux.num_agpr, 0
-; CHECK: .set qux.numbered_sgpr, 0
+; CHECK: .set qux.numbered_sgpr, 32
 ; CHECK: .set qux.private_seg_size, 0
 ; CHECK: .set qux.uses_vcc, 0
 ; CHECK: .set qux.uses_flat_scratch, 0
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -83,13 +83,13 @@
 ; CHECK-NEXT:      multiple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x24
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x21
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x24
 ; CHECK-NEXT:        .vgpr_count:     0x3
 ; CHECK-NEXT:      no_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x20
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      no_stack_call:
@@ -122,7 +122,7 @@
 ; CHECK-NEXT:      simple_lds:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0x100
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x20
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      simple_lds_recurse:
@@ -134,7 +134,7 @@
 ; CHECK-NEXT:      simple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x14
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x21
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x14
 ; CHECK-NEXT:        .vgpr_count:     0x2
 ; CHECK-NEXT:      simple_stack_call:
--- a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
@@ -2,7 +2,7 @@
 ;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK

 ; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
-; ;CHECK: NumVgprs: 2
+; ;CHECK: NumVgprs: 4
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
 .entry:
  %i1 = extractelement <2 x float> %arg3, i32 1
@@ -193,7 +193,7 @@ define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float>

 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused:
-; CHECK: NumVgprs: 2
+; CHECK: NumVgprs: 4
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
 .entry:
  ret { <4 x float> } undef
@@ -202,7 +202,7 @@ define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg
 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
 ; Additionally set the PSInputAddr to 0 via the metadata
 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0:
-; CHECK: NumVgprs: 2
+; CHECK: NumVgprs: 4
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 {
 .entry:
  ret { <4 x float> } undef
--- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -24,9 +24,7 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %

 ; SI-LABEL: {{^}}one_vgpr_used:
 ; SI: NumVgprs: 1
-define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) #0 {
+define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) nounwind {
  store i32 %x, ptr addrspace(1) %out, align 4
  ret void
 }
-
-attributes #0 = { nounwind noinline "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -122,8 +122,8 @@ define void @test_func() !dbg !6 {
 }

 ; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0:     TotalSGPRs: 22
-; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 3
+; STDERR-NEXT: remark: foo.cl:8:0:     TotalSGPRs: 4
+; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     AGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     ScratchSize [bytes/lane]: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     Dynamic Stack: False
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
@@ -4,8 +4,8 @@
 ; CHECK-LABEL: {{^}}spill:
 ; GCN:    NumSgprs: 104
 ; GCN-GCNTRACKERS:    NumSgprs: 104
-; GCN:    NumVgprs: 3
-; GCN-GCNTRACKERS:    NumVgprs: 3
+; GCN:    NumVgprs: 1
+; GCN-GCNTRACKERS:    NumVgprs: 2
 ; GCN:    ScratchSize: 0
 ; GCN-GCNTRACKERS:    ScratchSize: 0
 ; GCN:    Occupancy: 5
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -11,8 +11,8 @@
 ; allow scheduling of other instructions which reduce RP

 ; CHECK-LABEL: {{^}}return_72xi32:
-; GFX11-PAL:    NumSgprs: 0
-; GFX11-PAL-GCNTRACKERS:    NumSgprs: 0
+; GFX11-PAL:    NumSgprs: 33
+; GFX11-PAL-GCNTRACKERS:    NumSgprs: 33
 ; GFX11-PAL:    NumVgprs: 64
 ; GFX11-PAL-GCNTRACKERS:    NumVgprs: 64
 ; GFX11-PAL:    ScratchSize: 220
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -7,14 +7,14 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s

-; SI-MINREG: NumSgprs: {{[1]?[1-9]$}}
-; SI-MINREG: NumVgprs: {{[1]?[1-9]$}}
+; SI-MINREG: NumSgprs: {{[1-9]$}}
+; SI-MINREG: NumVgprs: {{[1-9]$}}

 ; SI-MAXOCC: NumSgprs: {{[1-4]?[0-9]$}}
 ; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}}

 ; stores may alias loads
-; VI-MINREG: NumSgprs: {{[1]?[0-9]$}}
+; VI-MINREG: NumSgprs: {{[0-9]$}}
 ; VI-MINREG: NumVgprs: {{[1-3][0-9]$}}

 ; stores may alias loads
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -35,7 +35,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 3
+; VI-NEXT:     .amdhsa_next_free_vgpr 1
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -86,7 +86,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -146,7 +146,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 3
+; VI-NEXT:     .amdhsa_next_free_vgpr 1
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -197,7 +197,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -257,7 +257,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 3
+; VI-NEXT:     .amdhsa_next_free_vgpr 1
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -308,7 +308,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -6,7 +6,7 @@

 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
 ; ASM: .amdhsa_reserve_xnack_mask 1

 ; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......

 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     12
+; ELF: .sgpr_count:     9
 entry:
  tail call void asm sideeffect "", "~{s[0:4]}"()
  ret void
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -6,7 +6,7 @@

 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
 ; ASM: .amdhsa_reserve_xnack_mask 0

 ; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......

 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     8
+; ELF: .sgpr_count:     5
 entry:
  tail call void asm sideeffect "", "~{s[0:4]}"()
  ret void
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -6,7 +6,7 @@

 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
 ; ASM: .amdhsa_reserve_xnack_mask 1

 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......

 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     12
+; ELF: .sgpr_count:     9
 entry:
  tail call void asm sideeffect "", "~{s[0:4]}"()
  ret void
--- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: __unnamed_1:
 ; CHECK: .set __unnamed_1.num_vgpr, 0
 ; CHECK: .set __unnamed_1.num_agpr, 0
-; CHECK: .set __unnamed_1.numbered_sgpr, 0
+; CHECK: .set __unnamed_1.numbered_sgpr, 32
 ; CHECK: .set __unnamed_1.private_seg_size, 0
 ; CHECK: .set __unnamed_1.uses_vcc, 0
 ; CHECK: .set __unnamed_1.uses_flat_scratch, 0
@@ -16,7 +16,7 @@ entry:
 }

 ; CHECK-LABEL: __unnamed_2:
-; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr)
+; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
 ; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
 ; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
 ; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size)
--- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
@@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 {
 }

 ; GCN-LABEL: {{^}}f1024_0:
-; GFX90A: NumVgprs: 1
+; GFX90A: NumVgprs: 32
 ; GFX90A: NumAgprs: 1
-; GFX90A: TotalNumVgprs: 5
+; GFX90A: TotalNumVgprs: 33
 define void @f1024_0() #1024 {
  call void @foo()
  ret void
--- a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
@@ -1,30 +0,0 @@
-; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED
-; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
-target triple = "amdgcn-amd-amdhsa"
-
-@global = addrspace(1) global i32 poison, align 4
-
-; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR.
-; Only hardware-initialized VGPRs (v0) are read in this kernel.
-
-; CHECK-LABEL: amdhsa.kernels:
-; CHECK-LABEL: kernel_x
-; CHECK: .vgpr_count:     1
-define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 {
-entry:
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: kernel_z
-; PACKED: .vgpr_count:     1
-; NOTPACKED: .vgpr_count:     3
-define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) {
-entry:
-  %id = call i32 @llvm.amdgcn.workitem.id.z()
-  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
-  ret void
-}
-
-attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
--- a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
@@ -1,35 +0,0 @@
-; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
-; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
-target triple = "amdgcn--amdpal"
-
-@global = addrspace(1) global i32 poison, align 4
-
-; CHECK-LABEL: amdpal.pipelines:
-
-; Neither uses not writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used.
-; CHECK-LABEL: .entry_point_symbol: kernel_use
-; CHECK: .vgpr_count:     0x20
-define amdgpu_cs void @kernel_use([32 x i32] %args) {
-entry:
-  %a = extractvalue [32 x i32] %args, 14
-  store i32 %a, ptr addrspace(1) @global
-  ret void
-}
-
-; Neither uses not writes a VGPR
-; CHECK-LABEL: chain_func:
-; CHECK: .vgpr_count:     0x1
-define amdgpu_cs_chain void @chain_func([32 x i32] %args) {
-entry:
-  call void (ptr, i32, {}, [32 x i32], i32, ...) @llvm.amdgcn.cs.chain.p0.i32.s.a(
-        ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0)
-  unreachable
-}
-
-; Neither uses not writes a VGPR
-; CHECK-LABEL: gfx_func:
-; CHECK: .vgpr_count:     0x1
-define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) {
-entry:
-  ret [32 x i32] %args
-}