AMDGPU: Add gfx950 subtarget definitions (#116307)

Mostly a stub, but adds some baseline tests and tests for removed instructions.
2024-11-18 10:41:14 -08:00
parent 18be88e20a
commit a6fc489bb7
43 changed files with 5063 additions and 4435 deletions
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -712,6 +712,8 @@ Target Specific Changes
 AMDGPU Support
 ^^^^^^^^^^^^^^

+- Added initial support for the ``gfx950`` target.
+
 - Added headers ``gpuintrin.h`` and ``amdgpuintrin.h`` that contains common
  definitions for GPU builtin functions. This header can be included for OpenMP,
  CUDA, HIP, OpenCL, and C/C++.
--- a/clang/include/clang/Basic/Cuda.h
+++ b/clang/include/clang/Basic/Cuda.h
@@ -107,6 +107,7 @@ enum class OffloadArch {
  GFX940,
  GFX941,
  GFX942,
+  GFX950,
  GFX10_1_GENERIC,
  GFX1010,
  GFX1011,
--- a/clang/lib/Basic/Cuda.cpp
+++ b/clang/lib/Basic/Cuda.cpp
@@ -125,6 +125,7 @@ static const OffloadArchToStringMap arch_names[] = {
    GFX(940),  // gfx940
    GFX(941),  // gfx941
    GFX(942),  // gfx942
+    GFX(950),  // gfx950
    {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"},
    GFX(1010), // gfx1010
    GFX(1011), // gfx1011
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -209,6 +209,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
      case OffloadArch::GFX940:
      case OffloadArch::GFX941:
      case OffloadArch::GFX942:
+      case OffloadArch::GFX950:
      case OffloadArch::GFX10_1_GENERIC:
      case OffloadArch::GFX1010:
      case OffloadArch::GFX1011:
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -2304,6 +2304,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
      case OffloadArch::GFX940:
      case OffloadArch::GFX941:
      case OffloadArch::GFX942:
+      case OffloadArch::GFX950:
      case OffloadArch::GFX10_1_GENERIC:
      case OffloadArch::GFX1010:
      case OffloadArch::GFX1011:
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -32,6 +32,7 @@
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx940 -emit-llvm -o - %s | FileCheck --check-prefix=GFX940 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx941 -emit-llvm -o - %s | FileCheck --check-prefix=GFX941 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -emit-llvm -o - %s | FileCheck --check-prefix=GFX942 %s
+// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s
 // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1012 %s
@@ -88,6 +89,7 @@
 // GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
 // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
 // GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
 // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
 // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
 // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
--- a/clang/test/Driver/amdgpu-macros.cl
+++ b/clang/test/Driver/amdgpu-macros.cl
@@ -110,6 +110,7 @@
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx940 -DFAMILY=GFX9
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx941 -DFAMILY=GFX9
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx942 -DFAMILY=GFX9
+// RUN: %clang -E -dM -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx950 -DFAMILY=GFX9
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1010 -DFAMILY=GFX10
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1011 -DFAMILY=GFX10
 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1012 -DFAMILY=GFX10
--- a/clang/test/Driver/amdgpu-mcpu.cl
+++ b/clang/test/Driver/amdgpu-mcpu.cl
@@ -95,6 +95,7 @@
 // RUN: %clang -### -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefix=GFX940 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefix=GFX941 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefix=GFX942 %s
+// RUN: %clang -### -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefix=GFX950 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefix=GFX1010 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefix=GFX1011 %s
 // RUN: %clang -### -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck --check-prefix=GFX1012 %s
@@ -150,6 +151,7 @@
 // GFX940:    "-target-cpu" "gfx940"
 // GFX941:    "-target-cpu" "gfx941"
 // GFX942:    "-target-cpu" "gfx942"
+// GFX950:    "-target-cpu" "gfx950"
 // GFX1010:   "-target-cpu" "gfx1010"
 // GFX1011:   "-target-cpu" "gfx1011"
 // GFX1012:   "-target-cpu" "gfx1012"
--- a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
+++ b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c
@@ -48,6 +48,7 @@
 // CHECK-SAME: {{^}}, gfx940
 // CHECK-SAME: {{^}}, gfx941
 // CHECK-SAME: {{^}}, gfx942
+// CHECK-SAME: {{^}}, gfx950
 // CHECK-SAME: {{^}}, gfx1010
 // CHECK-SAME: {{^}}, gfx1011
 // CHECK-SAME: {{^}}, gfx1012
--- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c
+++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c
@@ -54,6 +54,7 @@
 // CHECK-SAME: {{^}}, gfx940
 // CHECK-SAME: {{^}}, gfx941
 // CHECK-SAME: {{^}}, gfx942
+// CHECK-SAME: {{^}}, gfx950
 // CHECK-SAME: {{^}}, gfx10-1-generic
 // CHECK-SAME: {{^}}, gfx1010
 // CHECK-SAME: {{^}}, gfx1011
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -399,6 +399,13 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
                                                                        work-item
                                                                        IDs

+     ``gfx950``                  ``amdgcn``   dGPU  - sramecc         - Architected                   *TBA*
+                                                    - tgsplit           flat
+                                                    - xnack             scratch                       .. TODO::
+                                                    - kernarg preload - Packed
+                                                                        work-item                       Add product
+                                                                        IDs                             names.
+
     **GCN GFX10.1 (RDNA 1)** [AMD-GCN-GFX10-RDNA1]_
     -----------------------------------------------------------------------------------------------------------------------
     ``gfx1010``                 ``amdgcn``   dGPU  - cumode          - Absolute      - *rocm-amdhsa* - Radeon RX 5700
@@ -2178,7 +2185,7 @@ The AMDGPU backend uses the following ELF header:
     ``EF_AMDGPU_MACH_AMDGCN_GFX942``           0x04c      ``gfx942``
     *reserved*                                 0x04d      Reserved.
     ``EF_AMDGPU_MACH_AMDGCN_GFX1201``          0x04e      ``gfx1201``
-     *reserved*                                 0x04f      Reserved.
+     ``EF_AMDGPU_MACH_AMDGCN_GFX950``           0x04f      ``gfx950``
     *reserved*                                 0x050      Reserved.
     ``EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC``     0x051      ``gfx9-generic``
     ``EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC``  0x052      ``gfx10-1-generic``
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -811,7 +811,7 @@ enum : unsigned {
  EF_AMDGPU_MACH_AMDGCN_GFX942          = 0x04c,
  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D   = 0x04d,
  EF_AMDGPU_MACH_AMDGCN_GFX1201         = 0x04e,
-  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4F   = 0x04f,
+  EF_AMDGPU_MACH_AMDGCN_GFX950          = 0x04f,
  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50   = 0x050,
  EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC    = 0x051,
  EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052,
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -86,18 +86,19 @@ enum GPUKind : uint32_t {
  GK_GFX940 = 68,
  GK_GFX941 = 69,
  GK_GFX942 = 70,
+  GK_GFX950 = 71,

-  GK_GFX1010 = 71,
-  GK_GFX1011 = 72,
-  GK_GFX1012 = 73,
-  GK_GFX1013 = 74,
-  GK_GFX1030 = 75,
-  GK_GFX1031 = 76,
-  GK_GFX1032 = 77,
-  GK_GFX1033 = 78,
-  GK_GFX1034 = 79,
-  GK_GFX1035 = 80,
-  GK_GFX1036 = 81,
+  GK_GFX1010 = 72,
+  GK_GFX1011 = 73,
+  GK_GFX1012 = 74,
+  GK_GFX1013 = 75,
+  GK_GFX1030 = 76,
+  GK_GFX1031 = 77,
+  GK_GFX1032 = 78,
+  GK_GFX1033 = 79,
+  GK_GFX1034 = 80,
+  GK_GFX1035 = 81,
+  GK_GFX1036 = 82,

  GK_GFX1100 = 90,
  GK_GFX1101 = 91,
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -550,6 +550,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
    return "gfx941";
  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942:
    return "gfx942";
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950:
+    return "gfx950";

  // AMDGCN GFX10.
  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010:
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -609,6 +609,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX940, EF_AMDGPU_MACH);
    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX941, EF_AMDGPU_MACH);
    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX942, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX950, EF_AMDGPU_MACH);
    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH);
    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1011, EF_AMDGPU_MACH);
    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1012, EF_AMDGPU_MACH);
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -360,6 +360,12 @@ def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts",
  "Additional instructions for GFX940+"
 >;

+def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
+  "GFX950Insts",
+  "true",
+  "Additional instructions for GFX950+"
+>;
+
 def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
  "GFX10Insts",
  "true",
@@ -1470,6 +1476,14 @@ def FeatureISAVersion9_4_Common : FeatureSet<
   FeatureFlatBufferGlobalAtomicFaddF64Inst
   ]>;

+def FeatureISAVersion9_5_Common : FeatureSet<
+  !listconcat(FeatureISAVersion9_4_Common.Features,
+  [FeatureFP8Insts,
+   FeatureFP8ConversionInsts,
+   FeatureCvtFP8VOP1Bug,
+   FeatureGFX950Insts
+   ])>;
+
 def FeatureISAVersion9_4_0 : FeatureSet<
  !listconcat(FeatureISAVersion9_4_Common.Features,
    [
@@ -1503,6 +1517,8 @@ def FeatureISAVersion9_4_Generic : FeatureSet<
  !listconcat(FeatureISAVersion9_4_Common.Features,
    [FeatureRequiresCOV6])>;

+def FeatureISAVersion9_5_0 : FeatureSet<FeatureISAVersion9_5_Common.Features>;
+
 def FeatureISAVersion10_Common : FeatureSet<
  [FeatureGFX10,
   FeatureLDSBankCount32,
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -204,6 +204,10 @@ def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel,
  FeatureISAVersion9_4_2.Features
 >;

+def : ProcessorModel<"gfx950", SIDPGFX940FullSpeedModel,
+  FeatureISAVersion9_5_0.Features
+>;
+
 // [gfx900, gfx902, gfx904, gfx906, gfx909, gfx90c]
 def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel,
  FeatureISAVersion9_Generic.Features
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -106,6 +106,7 @@ protected:
  bool GFX9Insts = false;
  bool GFX90AInsts = false;
  bool GFX940Insts = false;
+  bool GFX950Insts = false;
  bool GFX10Insts = false;
  bool GFX11Insts = false;
  bool GFX12Insts = false;
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -96,6 +96,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940:  AK = GK_GFX940;  break;
  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941:  AK = GK_GFX941;  break;
  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942:  AK = GK_GFX942;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950:  AK = GK_GFX950;  break;
  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
@@ -182,6 +183,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
  case GK_GFX940:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940;
  case GK_GFX941:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX941;
  case GK_GFX942:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX942;
+  case GK_GFX950:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX950;
  case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
  case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
  case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -107,6 +107,7 @@ constexpr GPUInfo AMDGCNGPUs[] = {
    {{"gfx940"},    {"gfx940"},  GK_GFX940,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
    {{"gfx941"},    {"gfx941"},  GK_GFX941,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
    {{"gfx942"},    {"gfx942"},  GK_GFX942,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
+    {{"gfx950"},    {"gfx950"},  GK_GFX950,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
    {{"gfx1010"},   {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
    {{"gfx1011"},   {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
    {{"gfx1012"},   {"gfx1012"}, GK_GFX1012, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
@@ -262,6 +263,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
  case GK_GFX940:  return {9, 4, 0};
  case GK_GFX941:  return {9, 4, 1};
  case GK_GFX942:  return {9, 4, 2};
+  case GK_GFX950:  return {9, 5, 0};
  case GK_GFX1010: return {10, 1, 0};
  case GK_GFX1011: return {10, 1, 1};
  case GK_GFX1012: return {10, 1, 2};
@@ -361,7 +363,8 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
    Features["wavefrontsize32"] = true;
    Features["wavefrontsize64"] = true;
  } else if (T.isAMDGCN()) {
-    switch (parseArchAMDGCN(GPU)) {
+    AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU);
+    switch (Kind) {
    case GK_GFX1201:
    case GK_GFX1200:
    case GK_GFX12_GENERIC:
@@ -466,12 +469,16 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
      Features["s-memtime-inst"] = true;
      Features["gws"] = true;
      break;
+    case GK_GFX950:
+      Features["gfx950-insts"] = true;
+      [[fallthrough]];
    case GK_GFX942:
    case GK_GFX941:
    case GK_GFX940:
      Features["fp8-insts"] = true;
      Features["fp8-conversion-insts"] = true;
-      Features["xf32-insts"] = true;
+      if (Kind != GK_GFX950)
+        Features["xf32-insts"] = true;
      [[fallthrough]];
    case GK_GFX9_4_GENERIC:
      Features["gfx940-insts"] = true;
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN,GFX-940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s

 ; TODO: Add global-isel when it can support bf16

@@ -198,19 +199,33 @@ entry:
 }

 define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
-; GCN-LABEL: fptrunc_f32_to_bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v1, v1, v0, s0
-; GCN-NEXT:    v_or_b32_e32 v4, 0x400000, v0
-; GCN-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f32_to_bf16:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-940-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX-940-NEXT:    v_or_b32_e32 v4, 0x400000, v0
+; GFX-940-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f32_to_bf16:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-950-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX-950-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-950-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX-950-NEXT:    v_or_b32_e32 v4, 0x400000, v0
+; GFX-950-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX-950-NEXT:    s_nop 1
+; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
+; GFX-950-NEXT:    flat_store_short_d16_hi v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
  %a.cvt = fptrunc float %a to bfloat
  store bfloat %a.cvt, ptr %out
@@ -218,20 +233,35 @@ entry:
 }

 define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
-; GCN-LABEL: fptrunc_f32_to_bf16_abs:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
-; GCN-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v4, v4, v1, s0
-; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
-; GCN-NEXT:    v_cmp_u_f32_e64 vcc, |v0|, |v0|
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f32_to_bf16_abs:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-940-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
+; GFX-940-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v4, v4, v1, s0
+; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, |v0|, |v0|
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f32_to_bf16_abs:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-950-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v0
+; GFX-950-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX-950-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-950-NEXT:    v_add3_u32 v4, v4, v1, s0
+; GFX-950-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GFX-950-NEXT:    v_cmp_u_f32_e64 vcc, |v0|, |v0|
+; GFX-950-NEXT:    s_nop 1
+; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX-950-NEXT:    flat_store_short_d16_hi v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
  %a.abs = call float @llvm.fabs.f32(float %a)
  %a.cvt = fptrunc float %a.abs to bfloat
@@ -240,20 +270,35 @@ entry:
 }

 define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
-; GCN-LABEL: fptrunc_f32_to_bf16_neg:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_mov_b32_e32 v3, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, v1
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
-; GCN-NEXT:    v_bfe_u32 v4, v1, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v4, v4, v1, s0
-; GCN-NEXT:    v_or_b32_e32 v1, 0x400000, v1
-; GCN-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f32_to_bf16_neg:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-940-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-940-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
+; GFX-940-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v4, v4, v1, s0
+; GFX-940-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GFX-940-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f32_to_bf16_neg:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_mov_b32_e32 v3, v2
+; GFX-950-NEXT:    v_mov_b32_e32 v2, v1
+; GFX-950-NEXT:    v_xor_b32_e32 v1, 0x80000000, v0
+; GFX-950-NEXT:    v_bfe_u32 v4, v1, 16, 1
+; GFX-950-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-950-NEXT:    v_add3_u32 v4, v4, v1, s0
+; GFX-950-NEXT:    v_or_b32_e32 v1, 0x400000, v1
+; GFX-950-NEXT:    v_cmp_u_f32_e64 vcc, -v0, -v0
+; GFX-950-NEXT:    s_nop 1
+; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX-950-NEXT:    flat_store_short_d16_hi v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
  %a.neg = fneg float %a
  %a.cvt = fptrunc float %a.neg to bfloat
@@ -262,29 +307,53 @@ entry:
 }

 define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
-; GCN-LABEL: fptrunc_f64_to_bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
-; GCN-NEXT:    v_and_b32_e32 v7, 1, v6
-; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
-; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
-; GCN-NEXT:    v_add_u32_e32 v4, v6, v4
-; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GCN-NEXT:    s_brev_b32 s0, 1
-; GCN-NEXT:    v_and_or_b32 v5, v1, s0, v4
-; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
-; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
-; GCN-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f64_to_bf16:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX-940-NEXT:    v_and_b32_e32 v7, 1, v6
+; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GFX-940-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX-940-NEXT:    s_brev_b32 s0, 1
+; GFX-940-NEXT:    v_and_or_b32 v5, v1, s0, v4
+; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX-940-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f64_to_bf16:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
+; GFX-950-NEXT:    v_and_b32_e32 v7, 1, v6
+; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GFX-950-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-950-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX-950-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GFX-950-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-950-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX-950-NEXT:    s_brev_b32 s0, 1
+; GFX-950-NEXT:    v_and_or_b32 v5, v1, s0, v4
+; GFX-950-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX-950-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-950-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GFX-950-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX-950-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX-950-NEXT:    s_nop 1
+; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX-950-NEXT:    flat_store_short_d16_hi v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
  %a.cvt = fptrunc double %a to bfloat
  store bfloat %a.cvt, ptr %out
@@ -292,30 +361,55 @@ entry:
 }

 define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
-; GCN-LABEL: fptrunc_f64_to_bf16_neg:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
-; GCN-NEXT:    v_and_b32_e32 v8, 1, v7
-; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
-; GCN-NEXT:    v_add_u32_e32 v4, v7, v4
-; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT:    s_brev_b32 s4, 1
-; GCN-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GCN-NEXT:    v_and_or_b32 v5, v6, s4, v4
-; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
-; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
-; GCN-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f64_to_bf16_neg:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; GFX-940-NEXT:    v_and_b32_e32 v8, 1, v7
+; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GFX-940-NEXT:    v_add_u32_e32 v4, v7, v4
+; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-940-NEXT:    s_brev_b32 s4, 1
+; GFX-940-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX-940-NEXT:    v_and_or_b32 v5, v6, s4, v4
+; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX-940-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f64_to_bf16_neg:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; GFX-950-NEXT:    v_and_b32_e32 v8, 1, v7
+; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GFX-950-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-950-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX-950-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GFX-950-NEXT:    v_add_u32_e32 v4, v7, v4
+; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-950-NEXT:    s_brev_b32 s4, 1
+; GFX-950-NEXT:    v_xor_b32_e32 v6, 0x80000000, v1
+; GFX-950-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX-950-NEXT:    v_and_or_b32 v5, v6, s4, v4
+; GFX-950-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX-950-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-950-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GFX-950-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX-950-NEXT:    v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
+; GFX-950-NEXT:    s_nop 1
+; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX-950-NEXT:    flat_store_short_d16_hi v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
  %a.neg = fneg double %a
  %a.cvt = fptrunc double %a.neg to bfloat
@@ -324,30 +418,55 @@ entry:
 }

 define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
-; GCN-LABEL: fptrunc_f64_to_bf16_abs:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
-; GCN-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
-; GCN-NEXT:    v_and_b32_e32 v8, 1, v7
-; GCN-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
-; GCN-NEXT:    v_add_u32_e32 v4, v7, v4
-; GCN-NEXT:    s_or_b64 vcc, s[0:1], vcc
-; GCN-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GCN-NEXT:    s_brev_b32 s0, 1
-; GCN-NEXT:    v_and_or_b32 v5, v6, s0, v4
-; GCN-NEXT:    v_bfe_u32 v4, v4, 16, 1
-; GCN-NEXT:    s_movk_i32 s0, 0x7fff
-; GCN-NEXT:    v_add3_u32 v4, v4, v5, s0
-; GCN-NEXT:    v_or_b32_e32 v5, 0x400000, v5
-; GCN-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GCN-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
-; GCN-NEXT:    s_endpgm
+; GFX-940-LABEL: fptrunc_f64_to_bf16_abs:
+; GFX-940:       ; %bb.0: ; %entry
+; GFX-940-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GFX-940-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; GFX-940-NEXT:    v_and_b32_e32 v8, 1, v7
+; GFX-940-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-940-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX-940-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GFX-940-NEXT:    v_add_u32_e32 v4, v7, v4
+; GFX-940-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-940-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX-940-NEXT:    s_brev_b32 s0, 1
+; GFX-940-NEXT:    v_and_or_b32 v5, v6, s0, v4
+; GFX-940-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX-940-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-940-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GFX-940-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX-940-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
+; GFX-940-NEXT:    s_nop 1
+; GFX-940-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX-940-NEXT:    flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GFX-940-NEXT:    s_endpgm
+;
+; GFX-950-LABEL: fptrunc_f64_to_bf16_abs:
+; GFX-950:       ; %bb.0: ; %entry
+; GFX-950-NEXT:    v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GFX-950-NEXT:    v_cvt_f64_f32_e32 v[4:5], v7
+; GFX-950-NEXT:    v_and_b32_e32 v8, 1, v7
+; GFX-950-NEXT:    v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GFX-950-NEXT:    v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-950-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX-950-NEXT:    v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GFX-950-NEXT:    v_add_u32_e32 v4, v7, v4
+; GFX-950-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX-950-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
+; GFX-950-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX-950-NEXT:    s_brev_b32 s0, 1
+; GFX-950-NEXT:    v_and_or_b32 v5, v6, s0, v4
+; GFX-950-NEXT:    v_bfe_u32 v4, v4, 16, 1
+; GFX-950-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX-950-NEXT:    v_add3_u32 v4, v4, v5, s0
+; GFX-950-NEXT:    v_or_b32_e32 v5, 0x400000, v5
+; GFX-950-NEXT:    v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
+; GFX-950-NEXT:    s_nop 1
+; GFX-950-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX-950-NEXT:    flat_store_short_d16_hi v[2:3], v0
+; GFX-950-NEXT:    s_endpgm
 entry:
  %a.abs = call double @llvm.fabs.f64(double %a)
  %a.cvt = fptrunc double %a.abs to bfloat
--- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
@@ -80,6 +80,9 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX942-NOXNACK %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX942-XNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX950-NOXNACK %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX950-XNACK %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX1010 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX1010-NOXNACK %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX1010-XNACK %s
@@ -180,6 +183,9 @@
 ; GFX942: .amdgcn_target "amdgcn-amd-amdhsa--gfx942"
 ; GFX942-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack-"
 ; GFX942-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack+"
+; GFX950: .amdgcn_target "amdgcn-amd-amdhsa--gfx950"
+; GFX950-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx950:xnack-"
+; GFX950-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx950:xnack+"
 ; GFX1010: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010"
 ; GFX1010-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010:xnack-"
 ; GFX1010-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010:xnack+"
--- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
@@ -57,6 +57,7 @@
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX940 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx941 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX941 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx942 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX942 %s
+; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX950 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1010 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1010 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1011 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1011 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1012 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1012 %s
@@ -139,6 +140,7 @@
 ; GFX940:        EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40)
 ; GFX941:        EF_AMDGPU_MACH_AMDGCN_GFX941 (0x4B)
 ; GFX942:        EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C)
+; GFX950:        EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F)
 ; GFX1010:       EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33)
 ; GFX1011:       EF_AMDGPU_MACH_AMDGCN_GFX1011 (0x34)
 ; GFX1012:       EF_AMDGPU_MACH_AMDGCN_GFX1012 (0x35)
--- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll
@@ -12,6 +12,9 @@
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s
 ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s

+; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s
+; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s
+
 ; NO-SRAM-ECC-GFX906:      Flags [
 ; NO-SRAM-ECC-GFX906-NEXT:   EF_AMDGPU_FEATURE_XNACK_V3   (0x100)
 ; NO-SRAM-ECC-GFX906-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
@@ -44,6 +47,11 @@
 ; SRAM-ECC-GFX940:    EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40)
 ; SRAM-ECC-GFX940:  ]

+; SRAM-ECC-GFX950: Flags [
+; SRAM-ECC-GFX950:    EF_AMDGPU_FEATURE_SRAMECC_V3 (0x200)
+; SRAM-ECC-GFX950:    EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F)
+; SRAM-ECC-GFX950:  ]
+
 define amdgpu_kernel void @elf_header() {
  ret void
 }
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s

--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll
--- a/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s
+++ b/llvm/test/MC/AMDGPU/flat-scratch-gfx940.s
@@ -1,4 +1,5 @@
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck -check-prefix=GFX940 %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck -check-prefix=GFX940 %s

 scratch_load_dword a2, v4, s6
 // GFX940: scratch_load_dword a2, v4, s6           ; encoding: [0x00,0x60,0x50,0xdc,0x04,0x00,0x86,0x02]
--- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
@@ -1,4 +1,5 @@
 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX90A --implicit-check-not=error: %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX10 --implicit-check-not=error: %s

--- a/llvm/test/MC/AMDGPU/gfx950-unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx950-unsupported.s
@@ -0,0 +1,179 @@
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck -check-prefix=ERR %s
+
+//===----------------------------------------------------------------------===//
+// v_mfma_f32_32x32x4_xf32
+//===----------------------------------------------------------------------===//
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[2:3], v[4:5], a[2:5]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], a[0:3], v[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[0:3], a[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[0:3], v[0:3], a[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], a[0:3], a[0:3], v[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[2:3], v[4:5], a[2:5]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], a[0:3], v[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[0:3], a[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  a[0:3], v[0:3], v[0:3], a[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_32x32x4_xf32  v[0:3], a[0:3], a[0:3], v[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+
+//===----------------------------------------------------------------------===//
+// v_mfma_f32_16x16x8_xf32
+//===----------------------------------------------------------------------===//
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], v[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], a[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], v[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], a[0:3], 1.0
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7]
+// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
--- a/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt
+++ b/llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -disassemble -triple=amdgcn -mcpu=gfx950 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX950 %s
+
+# GFX950: warning: invalid instruction encoding
+0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04
+
+# GFX950: warning: invalid instruction encoding
+0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04
+
+# GFX950: warning: invalid instruction encoding
+0x00,0x00,0xbf,0xd3,0x02,0x09,0x0a,0x04
+
+# GFX950: warning: invalid instruction encoding
+0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt
@@ -1,4 +1,5 @@
 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -disassemble -show-encoding %s | FileCheck -strict-whitespace --check-prefix=GFX940 %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -disassemble -show-encoding %s | FileCheck -strict-whitespace --check-prefix=GFX940 %s

 # GFX940: global_load_dword v2, v[2:3], off sc0   ; encoding: [0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02]
 0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02
--- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
+++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
@@ -162,6 +162,10 @@
 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX942 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX942 %s
 # RUN: obj2yaml %t.o.AMDGCN_GFX942 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX942 %s

+# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX950/' %s | yaml2obj -o %t.o.AMDGCN_GFX950
+# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX950 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX950 %s
+# RUN: obj2yaml %t.o.AMDGCN_GFX950 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX950 %s
+
 # RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX1010/' %s | yaml2obj -o %t.o.AMDGCN_GFX1010
 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX1010 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX1010 %s
 # RUN: obj2yaml %t.o.AMDGCN_GFX1010 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX1010 %s
@@ -411,6 +415,9 @@
 # ELF-AMDGCN-GFX942:    EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C)
 # YAML-AMDGCN-GFX942:   Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX942 ]

+# ELF-AMDGCN-GFX950:    EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F)
+# YAML-AMDGCN-GFX950:   Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX950 ]
+
 # ELF-AMDGCN-GFX1010:   EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33)
 # YAML-AMDGCN-GFX1010:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1010 ]

--- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
+++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll
@@ -137,7 +137,6 @@ define amdgpu_kernel void @test_kernel() {

 ; ----------------------------------GFX9---------------------------------------
 ;
-
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=6 -mcpu=gfx9-4-generic -filetype=obj -O0 -o %t.o %s
 ; RUN: llvm-objdump -D --arch-name=amdgcn -mllvm --amdhsa-code-object-version=6 --mcpu=gfx9-4-generic %t.o > %t-specify.txt
 ; RUN: llvm-objdump -D  -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
@@ -148,6 +147,11 @@ define amdgpu_kernel void @test_kernel() {
 ; RUN: llvm-objdump -D  -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt
 ; RUN: diff %t-specify.txt %t-detect.txt

+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -filetype=obj -O0 -o %t.o %s
+; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx950 %t.o > %t-specify.txt
+; RUN: llvm-objdump -D %t.o > %t-detect.txt
+; RUN: diff %t-specify.txt %t-detect.txt
+
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj -O0 -o %t.o %s
 ; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx942 %t.o > %t-specify.txt
 ; RUN: llvm-objdump -D %t.o > %t-detect.txt
--- a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
+++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test
@@ -223,6 +223,15 @@
 # RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX942
 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX942 -DFLAG_VALUE=0x4C

+# RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F
+
+# RUN: yaml2obj %s -o %t -DABI_VERSION=1 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=1 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F
+
+# RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950
+# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX950 -DFLAG_VALUE=0x4F
+
 # RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1010
 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1010 -DFLAG_VALUE=0x33

--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -1619,6 +1619,7 @@ const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
  ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940"),                            \
  ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX941, "gfx941"),                            \
  ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX942, "gfx942"),                            \
+  ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX950, "gfx950"),                            \
  ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010"),                          \
  ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1011, "gfx1011"),                          \
  ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1012, "gfx1012"),                          \
--- a/offload/DeviceRTL/CMakeLists.txt
+++ b/offload/DeviceRTL/CMakeLists.txt
@@ -43,7 +43,7 @@ set(include_directory ${devicertl_base_directory}/include)
 set(source_directory ${devicertl_base_directory}/src)

 set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
-                             "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx1010"
+                             "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx950;gfx1010"
                             "gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035"
                             "gfx1036;gfx1100;gfx1101;gfx1102;gfx1103;gfx1150"
                             "gfx1151;gfx1152;gfx1153")